From 8b965b0a31f500fbe2a091b94644bab2c85cff63 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Thu, 12 Sep 2024 19:27:59 +0100 Subject: [PATCH] Handle polling without the healthcheck config --- lib/kamal/cli/app.rb | 2 +- lib/kamal/cli/app/boot.rb | 3 ++ lib/kamal/cli/healthcheck/poller.rb | 30 ++++++++----- lib/kamal/configuration.rb | 4 ++ .../configuration/docs/configuration.yml | 8 +++- test/cli/app_test.rb | 43 ++++++++++++++++++- test/fixtures/deploy_with_roles.yml | 1 + 7 files changed, 77 insertions(+), 14 deletions(-) diff --git a/lib/kamal/cli/app.rb b/lib/kamal/cli/app.rb index 7359fdde..195b5ec7 100644 --- a/lib/kamal/cli/app.rb +++ b/lib/kamal/cli/app.rb @@ -4,7 +4,7 @@ class Kamal::Cli::App < Kamal::Cli::Base with_lock do say "Get most recent version available as an image...", :magenta unless options[:version] using_version(version_or_latest) do |version| - say "Start container with version #{version} using a #{KAMAL.config.readiness_delay}s readiness delay (or reboot if already running)...", :magenta + say "Start container with version #{version} (or reboot if already running)...", :magenta # Assets are prepared in a separate step to ensure they are on all hosts before booting on(KAMAL.hosts) do diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index 74da5a56..6939825a 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -58,6 +58,9 @@ class Kamal::Cli::App::Boot else Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } end + rescue => e + error "Failed to boot #{role} on #{host}" + raise e end def stop_new_version diff --git a/lib/kamal/cli/healthcheck/poller.rb b/lib/kamal/cli/healthcheck/poller.rb index fd8714ce..fad51845 100644 --- a/lib/kamal/cli/healthcheck/poller.rb +++ b/lib/kamal/cli/healthcheck/poller.rb @@ -1,22 +1,30 @@ module Kamal::Cli::Healthcheck::Poller extend self - def wait_for_healthy(pause_after_ready: false, &block) + def wait_for_healthy(role, &block) attempt = 1 - max_attempts = 7 + timeout_at = Time.now + KAMAL.config.readiness_timeout + readiness_delay = KAMAL.config.readiness_delay begin - case status = block.call - when "healthy" - when "running" # No health check configured - sleep KAMAL.config.readiness_delay if pause_after_ready - else - raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})" + status = block.call + + if status == "running" + # Wait for the readiness delay and confirm it is still running + if readiness_delay > 0 + info "Container is running, waiting for readiness delay of #{readiness_delay} seconds" + sleep readiness_delay + status = block.call + end + end + + unless %w[ running healthy ].include?(status) + raise Kamal::Cli::Healthcheck::Error, "container not ready after #{KAMAL.config.readiness_timeout} seconds (#{status})" end rescue Kamal::Cli::Healthcheck::Error => e - if attempt <= max_attempts - info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..." - sleep attempt + time_left = timeout_at - Time.now + if time_left > 0 + sleep [ attempt, time_left ].min attempt += 1 retry else diff --git a/lib/kamal/configuration.rb b/lib/kamal/configuration.rb index 1d6a445e..bd33b08c 100644 --- a/lib/kamal/configuration.rb +++ b/lib/kamal/configuration.rb @@ -189,6 +189,10 @@ class Kamal::Configuration raw_config.readiness_delay || 7 end + def readiness_timeout + raw_config.readiness_timeout || 30 + end + def run_directory ".kamal" diff --git a/lib/kamal/configuration/docs/configuration.yml b/lib/kamal/configuration/docs/configuration.yml index 44910fda..2e7b618d 100644 --- a/lib/kamal/configuration/docs/configuration.yml +++ b/lib/kamal/configuration/docs/configuration.yml @@ -111,9 +111,15 @@ minimum_version: 1.3.0 # Readiness delay # # Seconds to wait for a container to boot after is running, default 7 -# This only applies to containers that do not specify a healthcheck +# This only applies to containers that do not run a proxy or specify a healthcheck readiness_delay: 4 +# Readiness timeout +# +# How long to wait for a container to become ready, default 30 +# This only applies to containers that do not run a proxy +readiness_timeout: 4 + # Run directory # # Directory to store kamal runtime files in on the host, default `.kamal` diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index 6c37d620..5eaf77a6 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -116,7 +116,7 @@ class CliAppTest < CliTestCase assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.4...", output assert_match "First web container is healthy, booting workers on 1.1.1.3", output assert_match "First web container is healthy, booting workers on 1.1.1.4", output - end + end end test "boot with web barrier closed" do @@ -144,6 +144,47 @@ class CliAppTest < CliTestCase Thread.report_on_exception = true end + test "boot with worker errors" do + Thread.report_on_exception = false + + Object.any_instance.stubs(:sleep) + + SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("unhealthy").at_least_once # workers health check + + run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.3...", output + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.4...", output + assert_match "First web container is healthy, booting workers on 1.1.1.3", output + assert_match "First web container is healthy, booting workers on 1.1.1.4", output + assert_match "ERROR Failed to boot workers on 1.1.1.3", output + assert_match "ERROR Failed to boot workers on 1.1.1.4", output + end + ensure + Thread.report_on_exception = true + end + + test "boot with worker ready then not" do + Thread.report_on_exception = false + + Object.any_instance.stubs(:sleep) + + SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("running", "stopped").at_least_once # workers health check + + run_command("boot", config: :with_roles, host: "1.1.1.3", allow_execute_error: true).tap do |output| + assert_match "ERROR Failed to boot workers on 1.1.1.3", output + end + ensure + Thread.report_on_exception = true + end + test "start" do SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("999") # old version diff --git a/test/fixtures/deploy_with_roles.yml b/test/fixtures/deploy_with_roles.yml index 1eb8cc5c..d831a5b4 100644 --- a/test/fixtures/deploy_with_roles.yml +++ b/test/fixtures/deploy_with_roles.yml @@ -16,3 +16,4 @@ registry: password: pw builder: arch: amd64 +readiness_timeout: 1