Handle polling without the healthcheck config

This commit is contained in:
Donal McBreen
2024-09-12 19:27:59 +01:00
parent d2672c771e
commit 8b965b0a31
7 changed files with 77 additions and 14 deletions

View File

@@ -4,7 +4,7 @@ class Kamal::Cli::App < Kamal::Cli::Base
with_lock do with_lock do
say "Get most recent version available as an image...", :magenta unless options[:version] say "Get most recent version available as an image...", :magenta unless options[:version]
using_version(version_or_latest) do |version| using_version(version_or_latest) do |version|
say "Start container with version #{version} using a #{KAMAL.config.readiness_delay}s readiness delay (or reboot if already running)...", :magenta say "Start container with version #{version} (or reboot if already running)...", :magenta
# Assets are prepared in a separate step to ensure they are on all hosts before booting # Assets are prepared in a separate step to ensure they are on all hosts before booting
on(KAMAL.hosts) do on(KAMAL.hosts) do

View File

@@ -58,6 +58,9 @@ class Kamal::Cli::App::Boot
else else
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
end end
rescue => e
error "Failed to boot #{role} on #{host}"
raise e
end end
def stop_new_version def stop_new_version

View File

@@ -1,22 +1,30 @@
module Kamal::Cli::Healthcheck::Poller module Kamal::Cli::Healthcheck::Poller
extend self extend self
def wait_for_healthy(pause_after_ready: false, &block) def wait_for_healthy(role, &block)
attempt = 1 attempt = 1
max_attempts = 7 timeout_at = Time.now + KAMAL.config.readiness_timeout
readiness_delay = KAMAL.config.readiness_delay
begin begin
case status = block.call status = block.call
when "healthy"
when "running" # No health check configured if status == "running"
sleep KAMAL.config.readiness_delay if pause_after_ready # Wait for the readiness delay and confirm it is still running
else if readiness_delay > 0
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})" info "Container is running, waiting for readiness delay of #{readiness_delay} seconds"
sleep readiness_delay
status = block.call
end
end
unless %w[ running healthy ].include?(status)
raise Kamal::Cli::Healthcheck::Error, "container not ready after #{KAMAL.config.readiness_timeout} seconds (#{status})"
end end
rescue Kamal::Cli::Healthcheck::Error => e rescue Kamal::Cli::Healthcheck::Error => e
if attempt <= max_attempts time_left = timeout_at - Time.now
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..." if time_left > 0
sleep attempt sleep [ attempt, time_left ].min
attempt += 1 attempt += 1
retry retry
else else

View File

@@ -189,6 +189,10 @@ class Kamal::Configuration
raw_config.readiness_delay || 7 raw_config.readiness_delay || 7
end end
# Readiness timeout taken from the raw config, falling back to 30 when the
# raw config does not provide a value.
def readiness_timeout
  configured = raw_config.readiness_timeout
  configured || 30
end
def run_directory def run_directory
".kamal" ".kamal"

View File

@@ -111,9 +111,15 @@ minimum_version: 1.3.0
# Readiness delay # Readiness delay
# #
# Seconds to wait for a container to boot after it is running, default 7 # Seconds to wait for a container to boot after it is running, default 7
# This only applies to containers that do not specify a healthcheck # This only applies to containers that do not run a proxy or specify a healthcheck
readiness_delay: 4 readiness_delay: 4
# Readiness timeout
#
# Seconds to wait for a container to become ready, default 30
# This only applies to containers that do not run a proxy
readiness_timeout: 4
# Run directory # Run directory
# #
# Directory to store kamal runtime files in on the host, default `.kamal` # Directory to store kamal runtime files in on the host, default `.kamal`

View File

@@ -144,6 +144,47 @@ class CliAppTest < CliTestCase
Thread.report_on_exception = true Thread.report_on_exception = true
end end
# Boots with the :with_roles config while the workers container status check
# always reports "unhealthy", then checks the output for the waiting, booting
# and failure messages of both worker hosts.
test "boot with worker errors" do
  Thread.report_on_exception = false
  Object.any_instance.stubs(:sleep)

  # Status query issued for the workers container of the "latest" version.
  workers_status_command = [
    :docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet",
    "|", :xargs, :docker, :inspect, "--format",
    "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'"
  ]

  # Any other capture_with_info call answers with the old version.
  SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123")
  # The workers health check never gets past "unhealthy".
  SSHKit::Backend::Abstract.any_instance
    .expects(:capture_with_info)
    .with(*workers_status_command)
    .returns("unhealthy")
    .at_least_once

  output = run_command("boot", config: :with_roles, host: nil, allow_execute_error: true)

  [
    "Waiting for the first healthy web container before booting workers on 1.1.1.3...",
    "Waiting for the first healthy web container before booting workers on 1.1.1.4...",
    "First web container is healthy, booting workers on 1.1.1.3",
    "First web container is healthy, booting workers on 1.1.1.4",
    "ERROR Failed to boot workers on 1.1.1.3",
    "ERROR Failed to boot workers on 1.1.1.4"
  ].each do |expected_message|
    assert_match expected_message, output
  end
ensure
  # Restore the setting changed at the top of the test.
  Thread.report_on_exception = true
end
# Boots host 1.1.1.3 with the :with_roles config while the workers container
# status check answers "running" first and "stopped" afterwards, then checks
# that the boot failure is reported for that host.
test "boot with worker ready then not" do
  Thread.report_on_exception = false
  Object.any_instance.stubs(:sleep)

  # Status query issued for the workers container of the "latest" version.
  workers_status_command = [
    :docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet",
    "|", :xargs, :docker, :inspect, "--format",
    "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'"
  ]

  # Any other capture_with_info call answers with the old version.
  SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123")
  # The workers health check: "running" on the first call, "stopped" after that.
  SSHKit::Backend::Abstract.any_instance
    .expects(:capture_with_info)
    .with(*workers_status_command)
    .returns("running", "stopped")
    .at_least_once

  output = run_command("boot", config: :with_roles, host: "1.1.1.3", allow_execute_error: true)

  assert_match "ERROR Failed to boot workers on 1.1.1.3", output
ensure
  # Restore the setting changed at the top of the test.
  Thread.report_on_exception = true
end
test "start" do test "start" do
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("999") # old version SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("999") # old version

View File

@@ -16,3 +16,4 @@ registry:
password: pw password: pw
builder: builder:
arch: amd64 arch: amd64
readiness_timeout: 1