From 78c0a0ba4b2093c2062a4b9336facece5da22f2e Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Tue, 21 May 2024 08:33:49 +0100 Subject: [PATCH] Don't start other roles until we have a healthy container If a primary role container is unhealthy, we might take a while to time out the health check poller. In the meantime if we have started the other roles, they'll be running two containers. This could be a problem, especially if they run jobs as that doubles the worker capacity which could cause excessive load. We'll wait for the first primary role container to boot successfully before starting the other containers from other roles. --- lib/kamal/cli/app/boot.rb | 41 ++++++++++++++++++++++++--------------- test/cli/app_test.rb | 4 ---- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index 5100e3cc..47f8c21c 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -34,6 +34,8 @@ class Kamal::Cli::App::Boot end def start_new_version + wait_at_barrier if queuer? + audit "Booted app version #{version}" execute *app.tie_cord(role.cord_host_file) if uses_cord? @@ -41,13 +43,10 @@ class Kamal::Cli::App::Boot execute *app.run(hostname: hostname) Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } - reach_barrier + release_barrier if gatekeeper? rescue => e - if barrier_role? && barrier&.close - info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})" - error capture_with_info(*app.logs(version: version)) - error capture_with_info(*app.container_health_log(version: version)) - end + close_barrier if gatekeeper? + execute *app.stop(version: version), raise_on_non_zero_exit: false raise @@ -67,19 +66,13 @@ class Kamal::Cli::App::Boot execute *app.clean_up_assets if assets? end - def reach_barrier - if barrier - if barrier_role? 
- if barrier.open - info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})" - end - else - wait_for_barrier - end + def release_barrier + if barrier.open + info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})" end end - def wait_for_barrier + def wait_at_barrier info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..." barrier.wait info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})" @@ -88,6 +81,14 @@ class Kamal::Cli::App::Boot raise end + def close_barrier + if barrier.close + info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})" + error capture_with_info(*app.logs(version: version)) + error capture_with_info(*app.container_health_log(version: version)) + end + end + def barrier_role? role == KAMAL.primary_role end @@ -103,4 +104,12 @@ class Kamal::Cli::App::Boot def audit(message) execute *auditor.record(message), verbosity: :debug end + + def gatekeeper? + barrier && barrier_role? + end + + def queuer? + barrier && !barrier_role? 
+ end end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index b5b03e27..f684deb8 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -154,10 +154,6 @@ class CliAppTest < CliTestCase .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") .returns("unhealthy").at_least_once # web health check failing - SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") - .returns("running").at_least_once # workers health check passing - stderred do run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| assert_match "Waiting for a healthy web container (1.1.1.3)...", output