Remove the healthcheck step
To speed up deployments, we'll remove the healthcheck step. This adds some risk to deployments for non-web roles - if they don't have a Docker healthcheck configured then the only check we do is if the container is running. If there is a bad image we might see the container running before it exits and deploy it. Previously the healthcheck step would have avoided this by ensuring a web container could boot and serve traffic first. To mitigate this, we'll add a deployment barrier. Until one of the primary role containers passes its healthcheck, we'll keep the barrier up and avoid stopping the containers on the non-primary roles. It the primary role container fails its healthcheck, we'll close the barrier and shut down the new containers on the waiting roles. We also have a new integration test to check we correctly handle a a broken image. This highlighted that SSHKit's default runner will stop at the first error it encounters. We'll now have a custom runner that waits for all threads to finish allowing them to clean up.
This commit is contained in:
@@ -118,6 +118,62 @@ class CliAppTest < CliTestCase
|
||||
end
|
||||
end
|
||||
|
||||
test "boot with web barrier opened" do
|
||||
Object.any_instance.stubs(:sleep)
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("running").at_least_once # web health check passing
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("unhealthy").at_least_once # web health check failing
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("running").at_least_once # workers health check
|
||||
|
||||
run_command("boot", config: :with_roles, host: nil).tap do |output|
|
||||
assert_match "Waiting at web barrier (1.1.1.3)...", output
|
||||
assert_match "Waiting at web barrier (1.1.1.4)...", output
|
||||
assert_match "Barrier opened (1.1.1.3)", output
|
||||
assert_match "Barrier opened (1.1.1.4)", output
|
||||
end
|
||||
end
|
||||
|
||||
test "boot with web barrier closed" do
|
||||
Thread.report_on_exception = false
|
||||
|
||||
Object.any_instance.stubs(:sleep)
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("unhealthy").at_least_once # web health check failing
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("running").at_least_once # workers health check passing
|
||||
|
||||
stderred do
|
||||
run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output|
|
||||
assert_match "Waiting at web barrier (1.1.1.3)...", output
|
||||
assert_match "Waiting at web barrier (1.1.1.4)...", output
|
||||
assert_match "Barrier closed, shutting down new container... (1.1.1.3)", output
|
||||
assert_match "Barrier closed, shutting down new container... (1.1.1.4)", output
|
||||
assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.1", output
|
||||
assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.2", output
|
||||
assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.3", output
|
||||
assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.4", output
|
||||
end
|
||||
end
|
||||
ensure
|
||||
Thread.report_on_exception = true
|
||||
end
|
||||
|
||||
test "start" do
|
||||
run_command("start").tap do |output|
|
||||
assert_match "docker start app-web-999", output
|
||||
@@ -283,8 +339,12 @@ class CliAppTest < CliTestCase
|
||||
end
|
||||
|
||||
private
|
||||
def run_command(*command, config: :with_accessories)
|
||||
stdouted { Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", "--hosts", "1.1.1.1" ]) }
|
||||
def run_command(*command, config: :with_accessories, host: "1.1.1.1", allow_execute_error: false)
|
||||
stdouted do
|
||||
Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", *([ "--hosts", host ] if host) ])
|
||||
rescue SSHKit::Runner::ExecuteError => e
|
||||
raise e unless allow_execute_error
|
||||
end
|
||||
end
|
||||
|
||||
def stub_running
|
||||
|
||||
@@ -1,82 +0,0 @@
|
||||
require_relative "cli_test_case"
|
||||
|
||||
class CliHealthcheckTest < CliTestCase
|
||||
test "perform" do
|
||||
# Prevent expected failures from outputting to terminal
|
||||
Thread.report_on_exception = false
|
||||
|
||||
Kamal::Cli::Healthcheck::Poller.stubs(:sleep) # No sleeping when retrying
|
||||
Kamal::Configuration.any_instance.stubs(:run_id).returns("12345678901234567890123456789012")
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false)
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
|
||||
.with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "KAMAL_CONTAINER_NAME=\"healthcheck-app\"", "--env-file", ".kamal/env/roles/app-web.env", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999")
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false)
|
||||
|
||||
# Fail twice to test retry logic
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("starting")
|
||||
.then
|
||||
.returns("unhealthy")
|
||||
.then
|
||||
.returns("healthy")
|
||||
|
||||
run_command("perform").tap do |output|
|
||||
assert_match "container not ready (starting), retrying in 1s (attempt 1/7)...", output
|
||||
assert_match "container not ready (unhealthy), retrying in 2s (attempt 2/7)...", output
|
||||
assert_match "Container is healthy!", output
|
||||
end
|
||||
end
|
||||
|
||||
test "perform failing to become healthy" do
|
||||
# Prevent expected failures from outputting to terminal
|
||||
Thread.report_on_exception = false
|
||||
|
||||
Kamal::Cli::Healthcheck::Poller.stubs(:sleep) # No sleeping when retrying
|
||||
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false)
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
|
||||
.with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "KAMAL_CONTAINER_NAME=\"healthcheck-app\"", "--env-file", ".kamal/env/roles/app-web.env", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999")
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false)
|
||||
|
||||
# Continually report unhealthy
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
|
||||
.returns("unhealthy")
|
||||
|
||||
# Capture logs when failing
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :logs, "--tail", 50, "2>&1")
|
||||
.returns("some log output")
|
||||
|
||||
# Capture container health log when failing
|
||||
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_pretty_json)
|
||||
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{json .State.Health}}'")
|
||||
.returns('{"Status":"unhealthy","Log":[{"ExitCode": 1,"Output": "/bin/sh: 1: curl: not found\n"}]}"')
|
||||
|
||||
exception = assert_raises do
|
||||
run_command("perform")
|
||||
end
|
||||
assert_match "container not ready (unhealthy)", exception.message
|
||||
end
|
||||
|
||||
test "raises an exception if primary does not have traefik" do
|
||||
SSHKit::Backend::Abstract.any_instance.expects(:execute).never
|
||||
|
||||
exception = assert_raises do
|
||||
run_command("perform", config_file: "test/fixtures/deploy_workers_only.yml")
|
||||
end
|
||||
|
||||
assert_equal "The primary host is not configured to run Traefik", exception.message
|
||||
end
|
||||
|
||||
private
|
||||
def run_command(*command, config_file: "test/fixtures/deploy_with_accessories.yml")
|
||||
stdouted { Kamal::Cli::Healthcheck.start([ *command, "-c", config_file ]) }
|
||||
end
|
||||
end
|
||||
@@ -27,7 +27,6 @@ class CliMainTest < CliTestCase
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true))
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options)
|
||||
@@ -40,7 +39,6 @@ class CliMainTest < CliTestCase
|
||||
assert_match /Log into image registry/, output
|
||||
assert_match /Pull app image/, output
|
||||
assert_match /Ensure Traefik is running/, output
|
||||
assert_match /Ensure app can pass healthcheck/, output
|
||||
assert_match /Detect stale containers/, output
|
||||
assert_match /Prune old containers and images/, output
|
||||
assert_match /Releasing the deploy lock/, output
|
||||
@@ -53,7 +51,6 @@ class CliMainTest < CliTestCase
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true))
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options)
|
||||
@@ -67,7 +64,6 @@ class CliMainTest < CliTestCase
|
||||
assert_match /Build and push app image/, output
|
||||
assert_hook_ran "pre-deploy", output, **hook_variables
|
||||
assert_match /Ensure Traefik is running/, output
|
||||
assert_match /Ensure app can pass healthcheck/, output
|
||||
assert_match /Detect stale containers/, output
|
||||
assert_match /Prune old containers and images/, output
|
||||
assert_hook_ran "post-deploy", output, **hook_variables, runtime: true
|
||||
@@ -80,7 +76,6 @@ class CliMainTest < CliTestCase
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true))
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options)
|
||||
@@ -90,7 +85,6 @@ class CliMainTest < CliTestCase
|
||||
assert_match /Log into image registry/, output
|
||||
assert_match /Pull app image/, output
|
||||
assert_match /Ensure Traefik is running/, output
|
||||
assert_match /Ensure app can pass healthcheck/, output
|
||||
assert_match /Detect stale containers/, output
|
||||
assert_match /Prune old containers and images/, output
|
||||
assert_match /Releasing the deploy lock/, output
|
||||
@@ -156,7 +150,6 @@ class CliMainTest < CliTestCase
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true))
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options)
|
||||
@@ -187,7 +180,6 @@ class CliMainTest < CliTestCase
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true))
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options)
|
||||
@@ -199,7 +191,6 @@ class CliMainTest < CliTestCase
|
||||
invoke_options = { "config_file" => "test/fixtures/deploy_simple.yml", "version" => "999", "skip_hooks" => false, "verbose" => true }
|
||||
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true))
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options)
|
||||
|
||||
@@ -212,7 +203,6 @@ class CliMainTest < CliTestCase
|
||||
assert_match /Build and push app image/, output
|
||||
assert_hook_ran "pre-deploy", output, **hook_variables
|
||||
assert_match /Running the pre-deploy hook.../, output
|
||||
assert_match /Ensure app can pass healthcheck/, output
|
||||
assert_hook_ran "post-deploy", output, **hook_variables, runtime: true
|
||||
end
|
||||
end
|
||||
@@ -221,13 +211,11 @@ class CliMainTest < CliTestCase
|
||||
invoke_options = { "config_file" => "test/fixtures/deploy_simple.yml", "version" => "999", "skip_hooks" => false }
|
||||
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options)
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true))
|
||||
Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options)
|
||||
|
||||
run_command("redeploy", "--skip_push").tap do |output|
|
||||
assert_match /Pull app image/, output
|
||||
assert_match /Ensure app can pass healthcheck/, output
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
Reference in New Issue
Block a user