Remove the healthcheck step

To speed up deployments, we'll remove the healthcheck step.

This adds some risk to deployments for non-web roles - if they don't
have a Docker healthcheck configured then the only check we do is if
the container is running.

If there is a bad image we might see the container running before it
exits and deploy it. Previously the healthcheck step would have avoided
this by ensuring a web container could boot and serve traffic first.

To mitigate this, we'll add a deployment barrier. Until one of the
primary role containers passes its healthcheck, we'll keep the barrier
up and avoid stopping the containers on the non-primary roles.

It the primary role container fails its healthcheck, we'll close the
barrier and shut down the new containers on the waiting roles.

We also have a new integration test to check we correctly handle a
a broken image. This highlighted that SSHKit's default runner will
stop at the first error it encounters. We'll now have a custom runner
that waits for all threads to finish allowing them to clean up.
This commit is contained in:
Donal McBreen
2024-03-21 11:36:21 +00:00
parent 990f1b4413
commit 0efb5ccfff
24 changed files with 269 additions and 327 deletions

View File

@@ -14,9 +14,12 @@ class Kamal::Cli::App < Kamal::Cli::Base
end
end
barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many?
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
# Ensure primary role is booted first to allow the web barrier to be opened
KAMAL.roles_on(host).each do |role|
Kamal::Cli::App::Boot.new(host, role, version, self).run
Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run
end
end
@@ -284,4 +287,8 @@ class Kamal::Cli::App < Kamal::Cli::Base
def version_or_latest
options[:version] || KAMAL.config.latest_tag
end
def web_and_non_web_roles?
KAMAL.roles.any?(&:running_traefik?) && !KAMAL.roles.all?(&:running_traefik?)
end
end

View File

@@ -1,12 +1,13 @@
class Kamal::Cli::App::Boot
attr_reader :host, :role, :version, :sshkit
attr_reader :host, :role, :version, :barrier, :sshkit
delegate :execute, :capture_with_info, :info, to: :sshkit
delegate :uses_cord?, :assets?, to: :role
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
def initialize(host, role, version, sshkit)
def initialize(host, role, sshkit, version, barrier)
@host = host
@role = role
@version = version
@barrier = barrier
@sshkit = sshkit
end
@@ -46,10 +47,18 @@ class Kamal::Cli::App::Boot
def start_new_version
audit "Booted app version #{version}"
execute *app.tie_cord(role.cord_host_file) if uses_cord?
hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}"
execute *app.run(hostname: hostname)
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
reach_barrier
rescue => e
close_barrier if barrier_role?
execute *app.stop(version: version), raise_on_non_zero_exit: false
raise
end
def stop_old_version(version)
@@ -65,4 +74,45 @@ class Kamal::Cli::App::Boot
execute *app.clean_up_assets if assets?
end
def reach_barrier
if barrier
if barrier_role?
if barrier.open
info "Opened barrier (#{host})"
end
else
wait_for_barrier
end
end
end
def wait_for_barrier
info "Waiting at web barrier (#{host})..."
barrier.wait
info "Barrier opened (#{host})"
rescue Kamal::Cli::Healthcheck::Error
info "Barrier closed, shutting down new container... (#{host})"
raise
end
def close_barrier
barrier&.close
end
def barrier_role?
role == KAMAL.primary_role
end
def app
@app ||= KAMAL.app(role: role)
end
def auditor
@auditor = KAMAL.auditor(role: role)
end
def audit(message)
execute *auditor.record(message), verbosity: :debug
end
end

View File

@@ -1,21 +0,0 @@
class Kamal::Cli::Healthcheck < Kamal::Cli::Base
default_command :perform
desc "perform", "Health check current app version"
def perform
raise "The primary host is not configured to run Traefik" unless KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
on(KAMAL.primary_host) do
begin
execute *KAMAL.healthcheck.run
Poller.wait_for_healthy { capture_with_info(*KAMAL.healthcheck.status) }
rescue Poller::HealthcheckError => e
error capture_with_info(*KAMAL.healthcheck.logs)
error capture_with_pretty_json(*KAMAL.healthcheck.container_health_log)
raise
ensure
execute *KAMAL.healthcheck.stop, raise_on_non_zero_exit: false
execute *KAMAL.healthcheck.remove, raise_on_non_zero_exit: false
end
end
end
end

View File

@@ -0,0 +1,31 @@
class Kamal::Cli::Healthcheck::Barrier
def initialize
@ivar = Concurrent::IVar.new
end
def close
set(false)
end
def open
set(true)
end
def wait
unless opened?
raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier")
end
end
private
def opened?
@ivar.value
end
def set(value)
@ivar.set(value)
true
rescue Concurrent::MultipleAssignmentError
false
end
end

View File

@@ -0,0 +1,2 @@
class Kamal::Cli::Healthcheck::Error < StandardError
end

View File

@@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller
TRAEFIK_UPDATE_DELAY = 5
class HealthcheckError < StandardError; end
def wait_for_healthy(pause_after_ready: false, &block)
attempt = 1
@@ -16,9 +15,9 @@ module Kamal::Cli::Healthcheck::Poller
when "running" # No health check configured
sleep KAMAL.config.readiness_delay if pause_after_ready
else
raise HealthcheckError, "container not ready (#{status})"
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
end
rescue HealthcheckError => e
rescue Kamal::Cli::Healthcheck::Error => e
if attempt <= max_attempts
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt
@@ -41,9 +40,9 @@ module Kamal::Cli::Healthcheck::Poller
when "unhealthy"
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
else
raise HealthcheckError, "container not unhealthy (#{status})"
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
end
rescue HealthcheckError => e
rescue Kamal::Cli::Healthcheck::Error => e
if attempt <= max_attempts
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt

View File

@@ -42,11 +42,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
say "Ensure Traefik is running...", :magenta
invoke "kamal:cli:traefik:boot", [], invoke_options
if KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
say "Ensure app can pass healthcheck...", :magenta
invoke "kamal:cli:healthcheck:perform", [], invoke_options
end
say "Detect stale containers...", :magenta
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
@@ -77,9 +72,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
run_hook "pre-deploy"
say "Ensure app can pass healthcheck...", :magenta
invoke "kamal:cli:healthcheck:perform", [], invoke_options
say "Detect stale containers...", :magenta
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
@@ -228,9 +220,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
desc "env", "Manage environment files"
subcommand "env", Kamal::Cli::Env
desc "healthcheck", "Healthcheck application"
subcommand "healthcheck", Kamal::Cli::Healthcheck
desc "lock", "Manage the deploy lock"
subcommand "lock", Kamal::Cli::Lock

View File

@@ -150,6 +150,7 @@ class Kamal::Commander
sshkit.max_concurrent_starts = config.sshkit.max_concurrent_starts
sshkit.ssh_options = config.ssh.options
end
SSHKit.config.default_runner = SSHKit::Runner::ParallelCompleteAll
SSHKit.config.command_map[:docker] = "docker" # No need to use /usr/bin/env, just clogs up the logs
SSHKit.config.output_verbosity = verbosity
end

View File

@@ -1,59 +0,0 @@
class Kamal::Commands::Healthcheck < Kamal::Commands::Base
def run
primary = config.role(config.primary_role)
docker :run,
"--detach",
"--name", container_name_with_version,
"--publish", "#{exposed_port}:#{config.healthcheck["port"]}",
"--label", "service=#{config.healthcheck_service}",
"-e", "KAMAL_CONTAINER_NAME=\"#{config.healthcheck_service}\"",
*primary.env_args(config.primary_host),
*primary.health_check_args(cord: false),
*config.volume_args,
*primary.option_args,
config.absolute_image,
primary.cmd
end
def status
pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT))
end
def container_health_log
pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_LOG_FORMAT))
end
def logs
pipe container_id, xargs(docker(:logs, "--tail", log_lines, "2>&1"))
end
def stop
pipe container_id, xargs(docker(:stop))
end
def remove
pipe container_id, xargs(docker(:container, :rm))
end
private
def container_name_with_version
"#{config.healthcheck_service}-#{config.version}"
end
def container_id
container_id_for(container_name: container_name_with_version)
end
def health_url
"http://localhost:#{exposed_port}#{config.healthcheck["path"]}"
end
def exposed_port
config.healthcheck["exposed_port"]
end
def log_lines
config.healthcheck["log_lines"]
end
end

View File

@@ -188,7 +188,7 @@ class Kamal::Configuration
def healthcheck
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
end
def healthcheck_service

View File

@@ -103,3 +103,34 @@ class SSHKit::Backend::Netssh
prepend LimitConcurrentStartsInstance
end
require "thread"
module SSHKit
module Runner
class ParallelCompleteAll < Abstract
def execute
threads = hosts.map do |host|
Thread.new(host) do |h|
begin
backend(h, &block).run
rescue ::StandardError => e
e2 = SSHKit::Runner::ExecuteError.new e
raise e2, "Exception while executing #{host.user ? "as #{host.user}@" : "on host "}#{host}: #{e.message}"
end
end
end
exception = nil
threads.each do |t|
begin
t.join
rescue SSHKit::Runner::ExecuteError => e
exception ||= e
end
end
raise exception if exception
end
end
end
end