Zero downtime deployment with cord file

When replacing a container currently we:
1. Boot the new container
2. Wait for it to become healthy
3. Stop the old container

Traefik will send requests to the old container until it notices that it
is unhealthy. But it may have stopped serving requests before that point
which can result in errors.

To get round that the new boot process is:

1. Create a directory with a single file on the host
2. Boot the new container, mounting the cord file into /tmp and
including a check for the file in the docker healthcheck
3. Wait for it to become healthy
4. Delete the healthcheck file ("cut the cord") for the old container
5. Wait for it to become unhealthy and give Traefik a couple of seconds
to notice
6. Stop the old container

The extra steps ensure that Traefik stops sending requests before the
old container is shutdown.
This commit is contained in:
Donal McBreen
2023-08-31 10:21:57 +01:00
parent 94bf090657
commit 8a41d15b69
17 changed files with 234 additions and 65 deletions

View File

@@ -11,10 +11,11 @@ class CliAppTest < CliTestCase
end
test "boot will rename if same version is already running" do
Object.any_instance.stubs(:sleep)
run_command("details") # Preheat Kamal const
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--filter", "name=^app-web-latest$", "--quiet", raise_on_non_zero_exit: false)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", raise_on_non_zero_exit: false)
.returns("12345678") # running version
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
@@ -25,6 +26,14 @@ class CliAppTest < CliTestCase
.with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=web", "--filter", "status=running", "--filter", "status=restarting", "--latest", "--format", "\"{{.Names}}\"", "|", "while read line; do echo ${line#app-web-}; done", raise_on_non_zero_exit: false)
.returns("123") # old version
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", "app-web-123", "|", :awk, "'$2 == \"/tmp/kamal-cord\" {print $1}'", :raise_on_non_zero_exit => false)
.returns("cordfile") # old version
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy") # old version unhealthy
run_command("boot").tap do |output|
assert_match /Renaming container .* to .* as already deployed on 1.1.1.1/, output # Rename
assert_match /docker rename app-web-latest app-web-latest_replaced_[0-9a-f]{16}/, output
@@ -180,10 +189,16 @@ class CliAppTest < CliTestCase
end
def stub_running
Object.any_instance.stubs(:sleep)
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running") # health check
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy") # health check
end
end

View File

@@ -6,6 +6,7 @@ class CliHealthcheckTest < CliTestCase
Thread.report_on_exception = false
Kamal::Utils::HealthcheckPoller.stubs(:sleep) # No sleeping when retrying
Kamal::Configuration.any_instance.stubs(:run_id).returns("12345678901234567890123456789012")
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false)

View File

@@ -176,9 +176,10 @@ class CliMainTest < CliTestCase
end
test "rollback good version" do
Object.any_instance.stubs(:sleep)
[ "web", "workers" ].each do |role|
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--filter", "name=^app-#{role}-123$", "--quiet", raise_on_non_zero_exit: false)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-#{role}-123$", "--quiet", raise_on_non_zero_exit: false)
.returns("").at_least_once
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-#{role}-123$", "--quiet")
@@ -191,14 +192,21 @@ class CliMainTest < CliTestCase
.returns("running").at_least_once # health check
end
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", "app-web-version-to-rollback", "|", :awk, "'$2 == \"/tmp/kamal-cord\" {print $1}'", :raise_on_non_zero_exit => false)
.returns("corddirectory").at_least_once # health check
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-version-to-rollback$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy").at_least_once # health check
Kamal::Commands::Hook.any_instance.stubs(:hook_exists?).returns(true)
hook_variables = { version: 123, service_version: "app@123", hosts: "1.1.1.1,1.1.1.2,1.1.1.3,1.1.1.4", command: "rollback" }
run_command("rollback", "123", config_file: "deploy_with_accessories").tap do |output|
assert_match "Start container with version 123", output
assert_hook_ran "pre-deploy", output, **hook_variables
assert_match "docker tag dhh/app:123 dhh/app:latest", output
assert_match "docker start app-web-123", output
assert_match "docker run --detach --restart unless-stopped --name app-web-123", output
assert_match "docker container ls --all --filter name=^app-web-version-to-rollback$ --quiet | xargs docker stop", output, "Should stop the container that was previously running"
assert_hook_ran "post-deploy", output, **hook_variables, runtime: "0"
end
@@ -210,7 +218,7 @@ class CliMainTest < CliTestCase
Kamal::Utils::HealthcheckPoller.stubs(:sleep)
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--filter", "name=^app-web-123$", "--quiet", raise_on_non_zero_exit: false)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", raise_on_non_zero_exit: false)
.returns("").at_least_once
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=web", "--filter", "status=running", "--filter", "status=restarting", "--latest", "--format", "\"{{.Names}}\"", "|", "while read line; do echo ${line#app-web-}; done", raise_on_non_zero_exit: false)
@@ -220,8 +228,7 @@ class CliMainTest < CliTestCase
.returns("running").at_least_once # health check
run_command("rollback", "123").tap do |output|
assert_match "Start container with version 123", output
assert_match "docker start app-web-123 || docker run --detach --restart unless-stopped --name app-web-123", output
assert_match "docker run --detach --restart unless-stopped --name app-web-123", output
assert_no_match "docker stop", output
end
end

View File

@@ -19,6 +19,8 @@ class CliTraefikTest < CliTestCase
end
test "reboot --rolling" do
Object.any_instance.stubs(:sleep)
run_command("reboot", "--rolling").tap do |output|
assert_match "Running docker container prune --force --filter label=org.opencontainers.image.title=Traefik on 1.1.1.1", output
end