Don't start other roles until we have a healthy container

If a primary role container is unhealthy, we might take a while to
time out the health check poller. In the meantime, if we have started the
other roles, they'll be running two containers.

This could be a problem, especially if they run jobs, as that
doubles the worker capacity, which could cause excessive load.

We'll wait for the first primary role container to boot successfully
before starting the other containers from other roles.
This commit is contained in:
Donal McBreen
2024-05-21 08:33:49 +01:00
parent ee758d951a
commit 78c0a0ba4b
2 changed files with 25 additions and 20 deletions

View File

@@ -34,6 +34,8 @@ class Kamal::Cli::App::Boot
end end
def start_new_version def start_new_version
wait_at_barrier if queuer?
audit "Booted app version #{version}" audit "Booted app version #{version}"
execute *app.tie_cord(role.cord_host_file) if uses_cord? execute *app.tie_cord(role.cord_host_file) if uses_cord?
@@ -41,13 +43,10 @@ class Kamal::Cli::App::Boot
execute *app.run(hostname: hostname) execute *app.run(hostname: hostname)
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
reach_barrier release_barrier if gatekeeper?
rescue => e rescue => e
if barrier_role? && barrier&.close close_barrier if gatekeeper?
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
error capture_with_info(*app.logs(version: version))
error capture_with_info(*app.container_health_log(version: version))
end
execute *app.stop(version: version), raise_on_non_zero_exit: false execute *app.stop(version: version), raise_on_non_zero_exit: false
raise raise
@@ -67,19 +66,13 @@ class Kamal::Cli::App::Boot
execute *app.clean_up_assets if assets? execute *app.clean_up_assets if assets?
end end
def reach_barrier def release_barrier
if barrier
if barrier_role?
if barrier.open if barrier.open
info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})" info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})"
end end
else
wait_for_barrier
end
end
end end
def wait_for_barrier def wait_at_barrier
info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..." info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..."
barrier.wait barrier.wait
info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})" info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})"
@@ -88,6 +81,14 @@ class Kamal::Cli::App::Boot
raise raise
end end
def close_barrier
if barrier.close
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
error capture_with_info(*app.logs(version: version))
error capture_with_info(*app.container_health_log(version: version))
end
end
def barrier_role? def barrier_role?
role == KAMAL.primary_role role == KAMAL.primary_role
end end
@@ -103,4 +104,12 @@ class Kamal::Cli::App::Boot
def audit(message) def audit(message)
execute *auditor.record(message), verbosity: :debug execute *auditor.record(message), verbosity: :debug
end end
def gatekeeper?
barrier && barrier_role?
end
def queuer?
barrier && !barrier_role?
end
end end

View File

@@ -154,10 +154,6 @@ class CliAppTest < CliTestCase
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy").at_least_once # web health check failing .returns("unhealthy").at_least_once # web health check failing
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running").at_least_once # workers health check passing
stderred do stderred do
run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output|
assert_match "Waiting for a healthy web container (1.1.1.3)...", output assert_match "Waiting for a healthy web container (1.1.1.3)...", output