Remove the healthcheck step

To speed up deployments, we'll remove the healthcheck step.

This adds some risk to deployments for non-web roles - if they don't
have a Docker healthcheck configured then the only check we do is if
the container is running.

If there is a bad image we might see the container running before it
exits and deploy it. Previously the healthcheck step would have avoided
this by ensuring a web container could boot and serve traffic first.

To mitigate this, we'll add a deployment barrier. Until one of the
primary role containers passes its healthcheck, we'll keep the barrier
up and avoid stopping the containers on the non-primary roles.

It the primary role container fails its healthcheck, we'll close the
barrier and shut down the new containers on the waiting roles.

We also have a new integration test to check we correctly handle a
a broken image. This highlighted that SSHKit's default runner will
stop at the first error it encounters. We'll now have a custom runner
that waits for all threads to finish allowing them to clean up.
This commit is contained in:
Donal McBreen
2024-03-21 11:36:21 +00:00
parent 990f1b4413
commit 0efb5ccfff
24 changed files with 269 additions and 327 deletions

View File

@@ -0,0 +1,24 @@
require_relative "integration_test"
class BrokenDeployTest < IntegrationTest
test "deploying a bad image" do
@app = "app_with_roles"
kamal :envify
first_version = latest_app_version
kamal :deploy
assert_app_is_up version: first_version
assert_container_running host: :vm3, name: "app-workers-#{first_version}"
second_version = break_app
kamal :deploy, raise_on_error: false
assert_app_is_up version: first_version
assert_container_running host: :vm3, name: "app-workers-#{first_version}"
assert_container_not_running host: :vm3, name: "app-workers-#{second_version}"
end
end

View File

@@ -28,6 +28,7 @@ builder:
COMMIT_SHA: <%= `git rev-parse HEAD` %>
healthcheck:
cmd: wget -qO- http://localhost > /dev/null || exit 1
max_attempts: 3
traefik:
args:
accesslog: true
@@ -41,3 +42,4 @@ accessories:
roles:
- web
stop_wait_time: 1
readiness_delay: 0

View File

@@ -22,6 +22,7 @@ builder:
COMMIT_SHA: <%= `git rev-parse HEAD` %>
healthcheck:
cmd: wget -qO- http://localhost > /dev/null || exit 1
max_attempts: 3
traefik:
args:
accesslog: true
@@ -35,3 +36,4 @@ accessories:
roles:
- web
stop_wait_time: 1
readiness_delay: 0

View File

@@ -0,0 +1,3 @@
#!/bin/bash
cd $1 && echo "bad nginx config" > default.conf && git commit -am 'Broken'

View File

@@ -78,6 +78,11 @@ class IntegrationTest < ActiveSupport::TestCase
latest_app_version
end
def break_app
deployer_exec "./break_app.sh #{@app}", workdir: "/"
latest_app_version
end
def latest_app_version
deployer_exec("git rev-parse HEAD", capture: true)
end
@@ -131,4 +136,16 @@ class IntegrationTest < ActiveSupport::TestCase
puts "Tried to get the response code again and got #{app_response.code}"
end
end
def assert_container_running(host:, name:)
assert container_running?(host: host, name: name)
end
def assert_container_not_running(host:, name:)
assert_not container_running?(host: host, name: name)
end
def container_running?(host:, name:)
docker_compose("exec #{host} docker ps --filter=name=#{name} | tail -n+2", capture: true).tap { |x| p [ x, x.strip, x.strip.present? ] }.strip.present?
end
end

View File

@@ -56,6 +56,12 @@ class MainTest < IntegrationTest
assert_app_is_up version: version
assert_hooks_ran "pre-connect", "pre-build", "pre-deploy", "post-deploy"
assert_container_running host: :vm3, name: "app-workers-#{version}"
second_version = update_app_rev
kamal :redeploy
assert_app_is_up version: second_version
assert_container_running host: :vm3, name: "app-workers-#{second_version}"
end
test "config" do
@@ -73,7 +79,7 @@ class MainTest < IntegrationTest
assert_equal({ user: "root", port: 22, keepalive: true, keepalive_interval: 30, log_level: :fatal }, config[:ssh_options])
assert_equal({ "multiarch" => false, "args" => { "COMMIT_SHA" => version } }, config[:builder])
assert_equal [ "--log-opt", "max-size=\"10m\"" ], config[:logging]
assert_equal({ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord"=>"/tmp/kamal-cord", "log_lines" => 50, "cmd"=>"wget -qO- http://localhost > /dev/null || exit 1" }, config[:healthcheck])
assert_equal({ "path" => "/up", "port" => 3000, "max_attempts" => 3, "cord"=>"/tmp/kamal-cord", "log_lines" => 50, "cmd"=>"wget -qO- http://localhost > /dev/null || exit 1" }, config[:healthcheck])
end
test "setup and remove" do
@@ -157,8 +163,4 @@ class MainTest < IntegrationTest
assert vm1_image_ids.any?
assert vm1_container_ids.any?
end
def assert_container_running(host:, name:)
assert docker_compose("exec #{host} docker ps --filter=name=#{name} -q", capture: true).strip.present?
end
end