Remove the healthcheck step

To speed up deployments, we'll remove the healthcheck step.

This adds some risk to deployments for non-web roles - if they don't
have a Docker healthcheck configured then the only check we do is if
the container is running.

If there is a bad image we might see the container running before it
exits and deploy it. Previously the healthcheck step would have avoided
this by ensuring a web container could boot and serve traffic first.

To mitigate this, we'll add a deployment barrier. Until one of the
primary role containers passes its healthcheck, we'll keep the barrier
up and avoid stopping the containers on the non-primary roles.

It the primary role container fails its healthcheck, we'll close the
barrier and shut down the new containers on the waiting roles.

We also have a new integration test to check we correctly handle a
a broken image. This highlighted that SSHKit's default runner will
stop at the first error it encounters. We'll now have a custom runner
that waits for all threads to finish allowing them to clean up.
This commit is contained in:
Donal McBreen
2024-03-21 11:36:21 +00:00
parent 990f1b4413
commit 0efb5ccfff
24 changed files with 269 additions and 327 deletions

View File

@@ -118,6 +118,62 @@ class CliAppTest < CliTestCase
end
end
test "boot with web barrier opened" do
Object.any_instance.stubs(:sleep)
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running").at_least_once # web health check passing
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy").at_least_once # web health check failing
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running").at_least_once # workers health check
run_command("boot", config: :with_roles, host: nil).tap do |output|
assert_match "Waiting at web barrier (1.1.1.3)...", output
assert_match "Waiting at web barrier (1.1.1.4)...", output
assert_match "Barrier opened (1.1.1.3)", output
assert_match "Barrier opened (1.1.1.4)", output
end
end
test "boot with web barrier closed" do
Thread.report_on_exception = false
Object.any_instance.stubs(:sleep)
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy").at_least_once # web health check failing
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running").at_least_once # workers health check passing
stderred do
run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output|
assert_match "Waiting at web barrier (1.1.1.3)...", output
assert_match "Waiting at web barrier (1.1.1.4)...", output
assert_match "Barrier closed, shutting down new container... (1.1.1.3)", output
assert_match "Barrier closed, shutting down new container... (1.1.1.4)", output
assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.1", output
assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.2", output
assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.3", output
assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.4", output
end
end
ensure
Thread.report_on_exception = true
end
test "start" do
run_command("start").tap do |output|
assert_match "docker start app-web-999", output
@@ -283,8 +339,12 @@ class CliAppTest < CliTestCase
end
private
def run_command(*command, config: :with_accessories)
stdouted { Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", "--hosts", "1.1.1.1" ]) }
def run_command(*command, config: :with_accessories, host: "1.1.1.1", allow_execute_error: false)
stdouted do
Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", *([ "--hosts", host ] if host) ])
rescue SSHKit::Runner::ExecuteError => e
raise e unless allow_execute_error
end
end
def stub_running