Remove the healthcheck step
To speed up deployments, we'll remove the healthcheck step. This adds some risk to deployments for non-web roles - if they don't have a Docker healthcheck configured then the only check we do is if the container is running. If there is a bad image we might see the container running before it exits and deploy it. Previously the healthcheck step would have avoided this by ensuring a web container could boot and serve traffic first. To mitigate this, we'll add a deployment barrier. Until one of the primary role containers passes its healthcheck, we'll keep the barrier up and avoid stopping the containers on the non-primary roles. It the primary role container fails its healthcheck, we'll close the barrier and shut down the new containers on the waiting roles. We also have a new integration test to check we correctly handle a a broken image. This highlighted that SSHKit's default runner will stop at the first error it encounters. We'll now have a custom runner that waits for all threads to finish allowing them to clean up.
This commit is contained in:
@@ -14,9 +14,12 @@ class Kamal::Cli::App < Kamal::Cli::Base
|
||||
end
|
||||
end
|
||||
|
||||
barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many?
|
||||
|
||||
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
|
||||
# Ensure primary role is booted first to allow the web barrier to be opened
|
||||
KAMAL.roles_on(host).each do |role|
|
||||
Kamal::Cli::App::Boot.new(host, role, version, self).run
|
||||
Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run
|
||||
end
|
||||
end
|
||||
|
||||
@@ -284,4 +287,8 @@ class Kamal::Cli::App < Kamal::Cli::Base
|
||||
def version_or_latest
|
||||
options[:version] || KAMAL.config.latest_tag
|
||||
end
|
||||
|
||||
def web_and_non_web_roles?
|
||||
KAMAL.roles.any?(&:running_traefik?) && !KAMAL.roles.all?(&:running_traefik?)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
class Kamal::Cli::App::Boot
|
||||
attr_reader :host, :role, :version, :sshkit
|
||||
attr_reader :host, :role, :version, :barrier, :sshkit
|
||||
delegate :execute, :capture_with_info, :info, to: :sshkit
|
||||
delegate :uses_cord?, :assets?, to: :role
|
||||
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
|
||||
|
||||
def initialize(host, role, version, sshkit)
|
||||
def initialize(host, role, sshkit, version, barrier)
|
||||
@host = host
|
||||
@role = role
|
||||
@version = version
|
||||
@barrier = barrier
|
||||
@sshkit = sshkit
|
||||
end
|
||||
|
||||
@@ -46,10 +47,18 @@ class Kamal::Cli::App::Boot
|
||||
|
||||
def start_new_version
|
||||
audit "Booted app version #{version}"
|
||||
|
||||
execute *app.tie_cord(role.cord_host_file) if uses_cord?
|
||||
hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}"
|
||||
execute *app.run(hostname: hostname)
|
||||
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
|
||||
|
||||
reach_barrier
|
||||
rescue => e
|
||||
close_barrier if barrier_role?
|
||||
execute *app.stop(version: version), raise_on_non_zero_exit: false
|
||||
|
||||
raise
|
||||
end
|
||||
|
||||
def stop_old_version(version)
|
||||
@@ -65,4 +74,45 @@ class Kamal::Cli::App::Boot
|
||||
|
||||
execute *app.clean_up_assets if assets?
|
||||
end
|
||||
|
||||
def reach_barrier
|
||||
if barrier
|
||||
if barrier_role?
|
||||
if barrier.open
|
||||
info "Opened barrier (#{host})"
|
||||
end
|
||||
else
|
||||
wait_for_barrier
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def wait_for_barrier
|
||||
info "Waiting at web barrier (#{host})..."
|
||||
barrier.wait
|
||||
info "Barrier opened (#{host})"
|
||||
rescue Kamal::Cli::Healthcheck::Error
|
||||
info "Barrier closed, shutting down new container... (#{host})"
|
||||
raise
|
||||
end
|
||||
|
||||
def close_barrier
|
||||
barrier&.close
|
||||
end
|
||||
|
||||
def barrier_role?
|
||||
role == KAMAL.primary_role
|
||||
end
|
||||
|
||||
def app
|
||||
@app ||= KAMAL.app(role: role)
|
||||
end
|
||||
|
||||
def auditor
|
||||
@auditor = KAMAL.auditor(role: role)
|
||||
end
|
||||
|
||||
def audit(message)
|
||||
execute *auditor.record(message), verbosity: :debug
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
class Kamal::Cli::Healthcheck < Kamal::Cli::Base
|
||||
default_command :perform
|
||||
|
||||
desc "perform", "Health check current app version"
|
||||
def perform
|
||||
raise "The primary host is not configured to run Traefik" unless KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
|
||||
on(KAMAL.primary_host) do
|
||||
begin
|
||||
execute *KAMAL.healthcheck.run
|
||||
Poller.wait_for_healthy { capture_with_info(*KAMAL.healthcheck.status) }
|
||||
rescue Poller::HealthcheckError => e
|
||||
error capture_with_info(*KAMAL.healthcheck.logs)
|
||||
error capture_with_pretty_json(*KAMAL.healthcheck.container_health_log)
|
||||
raise
|
||||
ensure
|
||||
execute *KAMAL.healthcheck.stop, raise_on_non_zero_exit: false
|
||||
execute *KAMAL.healthcheck.remove, raise_on_non_zero_exit: false
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
31
lib/kamal/cli/healthcheck/barrier.rb
Normal file
31
lib/kamal/cli/healthcheck/barrier.rb
Normal file
@@ -0,0 +1,31 @@
|
||||
class Kamal::Cli::Healthcheck::Barrier
|
||||
def initialize
|
||||
@ivar = Concurrent::IVar.new
|
||||
end
|
||||
|
||||
def close
|
||||
set(false)
|
||||
end
|
||||
|
||||
def open
|
||||
set(true)
|
||||
end
|
||||
|
||||
def wait
|
||||
unless opened?
|
||||
raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier")
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
def opened?
|
||||
@ivar.value
|
||||
end
|
||||
|
||||
def set(value)
|
||||
@ivar.set(value)
|
||||
true
|
||||
rescue Concurrent::MultipleAssignmentError
|
||||
false
|
||||
end
|
||||
end
|
||||
2
lib/kamal/cli/healthcheck/error.rb
Normal file
2
lib/kamal/cli/healthcheck/error.rb
Normal file
@@ -0,0 +1,2 @@
|
||||
class Kamal::Cli::Healthcheck::Error < StandardError
|
||||
end
|
||||
@@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller
|
||||
|
||||
TRAEFIK_UPDATE_DELAY = 5
|
||||
|
||||
class HealthcheckError < StandardError; end
|
||||
|
||||
def wait_for_healthy(pause_after_ready: false, &block)
|
||||
attempt = 1
|
||||
@@ -16,9 +15,9 @@ module Kamal::Cli::Healthcheck::Poller
|
||||
when "running" # No health check configured
|
||||
sleep KAMAL.config.readiness_delay if pause_after_ready
|
||||
else
|
||||
raise HealthcheckError, "container not ready (#{status})"
|
||||
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
|
||||
end
|
||||
rescue HealthcheckError => e
|
||||
rescue Kamal::Cli::Healthcheck::Error => e
|
||||
if attempt <= max_attempts
|
||||
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
|
||||
sleep attempt
|
||||
@@ -41,9 +40,9 @@ module Kamal::Cli::Healthcheck::Poller
|
||||
when "unhealthy"
|
||||
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
|
||||
else
|
||||
raise HealthcheckError, "container not unhealthy (#{status})"
|
||||
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
|
||||
end
|
||||
rescue HealthcheckError => e
|
||||
rescue Kamal::Cli::Healthcheck::Error => e
|
||||
if attempt <= max_attempts
|
||||
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
|
||||
sleep attempt
|
||||
|
||||
@@ -42,11 +42,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
|
||||
say "Ensure Traefik is running...", :magenta
|
||||
invoke "kamal:cli:traefik:boot", [], invoke_options
|
||||
|
||||
if KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
|
||||
say "Ensure app can pass healthcheck...", :magenta
|
||||
invoke "kamal:cli:healthcheck:perform", [], invoke_options
|
||||
end
|
||||
|
||||
say "Detect stale containers...", :magenta
|
||||
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
|
||||
|
||||
@@ -77,9 +72,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
|
||||
|
||||
run_hook "pre-deploy"
|
||||
|
||||
say "Ensure app can pass healthcheck...", :magenta
|
||||
invoke "kamal:cli:healthcheck:perform", [], invoke_options
|
||||
|
||||
say "Detect stale containers...", :magenta
|
||||
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
|
||||
|
||||
@@ -228,9 +220,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
|
||||
desc "env", "Manage environment files"
|
||||
subcommand "env", Kamal::Cli::Env
|
||||
|
||||
desc "healthcheck", "Healthcheck application"
|
||||
subcommand "healthcheck", Kamal::Cli::Healthcheck
|
||||
|
||||
desc "lock", "Manage the deploy lock"
|
||||
subcommand "lock", Kamal::Cli::Lock
|
||||
|
||||
|
||||
Reference in New Issue
Block a user