Don't start other roles until we have a healthy container

If a primary role container is unhealthy, we might take a while to
time out the health check poller. In the meantime, if we have started the
other roles, they'll be running two containers.

This could be a problem, especially if they run jobs, as that
doubles the worker capacity, which could cause excessive load.

We'll wait for the first primary role container to boot successfully
before starting the other containers from other roles.
This commit is contained in:
Donal McBreen
2024-05-21 08:33:49 +01:00
parent ee758d951a
commit 78c0a0ba4b
2 changed files with 25 additions and 20 deletions

View File

@@ -34,6 +34,8 @@ class Kamal::Cli::App::Boot
end
def start_new_version
wait_at_barrier if queuer?
audit "Booted app version #{version}"
execute *app.tie_cord(role.cord_host_file) if uses_cord?
@@ -41,13 +43,10 @@ class Kamal::Cli::App::Boot
execute *app.run(hostname: hostname)
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
reach_barrier
release_barrier if gatekeeper?
rescue => e
if barrier_role? && barrier&.close
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
error capture_with_info(*app.logs(version: version))
error capture_with_info(*app.container_health_log(version: version))
end
close_barrier if gatekeeper?
execute *app.stop(version: version), raise_on_non_zero_exit: false
raise
@@ -67,19 +66,13 @@ class Kamal::Cli::App::Boot
execute *app.clean_up_assets if assets?
end
def reach_barrier
if barrier
if barrier_role?
def release_barrier
if barrier.open
info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})"
end
else
wait_for_barrier
end
end
end
def wait_for_barrier
def wait_at_barrier
info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..."
barrier.wait
info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})"
@@ -88,6 +81,14 @@ class Kamal::Cli::App::Boot
raise
end
def close_barrier
if barrier.close
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
error capture_with_info(*app.logs(version: version))
error capture_with_info(*app.container_health_log(version: version))
end
end
def barrier_role?
role == KAMAL.primary_role
end
@@ -103,4 +104,12 @@ class Kamal::Cli::App::Boot
def audit(message)
execute *auditor.record(message), verbosity: :debug
end
def gatekeeper?
barrier && barrier_role?
end
def queuer?
barrier && !barrier_role?
end
end

View File

@@ -154,10 +154,6 @@ class CliAppTest < CliTestCase
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy").at_least_once # web health check failing
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running").at_least_once # workers health check passing
stderred do
run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output|
assert_match "Waiting for a healthy web container (1.1.1.3)...", output