Don't start other roles until we have a healthy container
If a primary role container is unhealthy, the health check poller might take a while to time out. In the meantime, if we have started the other roles, they'll be running two containers. This could be a problem, especially if they run jobs, as that doubles the worker capacity, which could cause excessive load. We'll wait for the first primary role container to boot successfully before starting the containers from the other roles.
This commit is contained in:
@@ -34,6 +34,8 @@ class Kamal::Cli::App::Boot
|
||||
end
|
||||
|
||||
def start_new_version
|
||||
wait_at_barrier if queuer?
|
||||
|
||||
audit "Booted app version #{version}"
|
||||
|
||||
execute *app.tie_cord(role.cord_host_file) if uses_cord?
|
||||
@@ -41,13 +43,10 @@ class Kamal::Cli::App::Boot
|
||||
execute *app.run(hostname: hostname)
|
||||
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
|
||||
|
||||
reach_barrier
|
||||
release_barrier if gatekeeper?
|
||||
rescue => e
|
||||
if barrier_role? && barrier&.close
|
||||
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
|
||||
error capture_with_info(*app.logs(version: version))
|
||||
error capture_with_info(*app.container_health_log(version: version))
|
||||
end
|
||||
close_barrier if gatekeeper?
|
||||
|
||||
execute *app.stop(version: version), raise_on_non_zero_exit: false
|
||||
|
||||
raise
|
||||
@@ -67,19 +66,13 @@ class Kamal::Cli::App::Boot
|
||||
execute *app.clean_up_assets if assets?
|
||||
end
|
||||
|
||||
def reach_barrier
|
||||
if barrier
|
||||
if barrier_role?
|
||||
def release_barrier
|
||||
if barrier.open
|
||||
info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})"
|
||||
end
|
||||
else
|
||||
wait_for_barrier
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def wait_for_barrier
|
||||
def wait_at_barrier
|
||||
info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..."
|
||||
barrier.wait
|
||||
info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})"
|
||||
@@ -88,6 +81,14 @@ class Kamal::Cli::App::Boot
|
||||
raise
|
||||
end
|
||||
|
||||
def close_barrier
|
||||
if barrier.close
|
||||
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
|
||||
error capture_with_info(*app.logs(version: version))
|
||||
error capture_with_info(*app.container_health_log(version: version))
|
||||
end
|
||||
end
|
||||
|
||||
def barrier_role?
|
||||
role == KAMAL.primary_role
|
||||
end
|
||||
@@ -103,4 +104,12 @@ class Kamal::Cli::App::Boot
|
||||
def audit(message)
|
||||
execute *auditor.record(message), verbosity: :debug
|
||||
end
|
||||
|
||||
def gatekeeper?
|
||||
barrier && barrier_role?
|
||||
end
|
||||
|
||||
def queuer?
|
||||
barrier && !barrier_role?
|
||||
end
|
||||
end
|
||||
|
||||
@@ -154,10 +154,6 @@ class CliAppTest < CliTestCase
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("unhealthy").at_least_once # web health check failing
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("running").at_least_once # workers health check passing
|
||||
|
||||
stderred do
|
||||
run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output|
|
||||
assert_match "Waiting for a healthy web container (1.1.1.3)...", output
|
||||
|
||||
Reference in New Issue
Block a user