Merge pull request #740 from basecamp/remove-healthcheck-step

Remove the healthcheck step
This commit is contained in:
Donal McBreen
2024-05-21 12:00:25 +01:00
committed by GitHub
26 changed files with 300 additions and 345 deletions

View File

@@ -14,12 +14,16 @@ class Kamal::Cli::App < Kamal::Cli::Base
end
end
#  Primary hosts and roles are returned first, so they can open the barrier
barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many?
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
KAMAL.roles_on(host).each do |role|
Kamal::Cli::App::Boot.new(host, role, version, self).run
Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run
end
end
#  Tag once the app booted on all hosts
on(KAMAL.hosts) do |host|
execute *KAMAL.auditor.record("Tagging #{KAMAL.config.absolute_image} as the latest image"), verbosity: :debug
execute *KAMAL.app.tag_latest_image

View File

@@ -1,19 +1,30 @@
class Kamal::Cli::App::Boot
attr_reader :host, :role, :version, :sshkit
delegate :execute, :capture_with_info, :info, to: :sshkit
delegate :uses_cord?, :assets?, to: :role
attr_reader :host, :role, :version, :barrier, :sshkit
delegate :execute, :capture_with_info, :capture_with_pretty_json, :info, :error, to: :sshkit
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
def initialize(host, role, version, sshkit)
def initialize(host, role, sshkit, version, barrier)
@host = host
@role = role
@version = version
@barrier = barrier
@sshkit = sshkit
end
def run
old_version = old_version_renamed_if_clashing
start_new_version
wait_at_barrier if queuer?
begin
start_new_version
rescue => e
close_barrier if gatekeeper?
stop_new_version
raise
end
release_barrier if gatekeeper?
if old_version
stop_old_version(old_version)
@@ -21,18 +32,6 @@ class Kamal::Cli::App::Boot
end
private
def app
@app ||= KAMAL.app(role: role, host: host)
end
def auditor
@auditor = KAMAL.auditor(role: role)
end
def audit(message)
execute *auditor.record(message), verbosity: :debug
end
def old_version_renamed_if_clashing
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
renamed_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
@@ -46,12 +45,17 @@ class Kamal::Cli::App::Boot
def start_new_version
audit "Booted app version #{version}"
execute *app.tie_cord(role.cord_host_file) if uses_cord?
hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}"
execute *app.run(hostname: hostname)
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
end
def stop_new_version
execute *app.stop(version: version), raise_on_non_zero_exit: false
end
def stop_old_version(version)
if uses_cord?
cord = capture_with_info(*app.cord(version: version), raise_on_non_zero_exit: false).strip
@@ -65,4 +69,51 @@ class Kamal::Cli::App::Boot
execute *app.clean_up_assets if assets?
end
def release_barrier
if barrier.open
info "First #{KAMAL.primary_role} container is healthy on #{host}, booting other roles"
end
end
def wait_at_barrier
info "Waiting for the first healthy #{KAMAL.primary_role} container before booting #{role} on #{host}..."
barrier.wait
info "First #{KAMAL.primary_role} container is healthy, booting #{role} on #{host}..."
rescue Kamal::Cli::Healthcheck::Error
info "First #{KAMAL.primary_role} container is unhealthy, not booting #{role} on #{host}"
raise
end
def close_barrier
if barrier.close
info "First #{KAMAL.primary_role} container is unhealthy on #{host}, not booting other roles"
error capture_with_info(*app.logs(version: version))
error capture_with_info(*app.container_health_log(version: version))
end
end
def barrier_role?
role == KAMAL.primary_role
end
def app
@app ||= KAMAL.app(role: role, host: host)
end
def auditor
@auditor = KAMAL.auditor(role: role)
end
def audit(message)
execute *auditor.record(message), verbosity: :debug
end
def gatekeeper?
barrier && barrier_role?
end
def queuer?
barrier && !barrier_role?
end
end

View File

@@ -1,21 +0,0 @@
class Kamal::Cli::Healthcheck < Kamal::Cli::Base
default_command :perform
desc "perform", "Health check current app version"
def perform
raise "The primary host is not configured to run Traefik" unless KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
on(KAMAL.primary_host) do
begin
execute *KAMAL.healthcheck.run
Poller.wait_for_healthy { capture_with_info(*KAMAL.healthcheck.status) }
rescue Poller::HealthcheckError => e
error capture_with_info(*KAMAL.healthcheck.logs)
error capture_with_pretty_json(*KAMAL.healthcheck.container_health_log)
raise
ensure
execute *KAMAL.healthcheck.stop, raise_on_non_zero_exit: false
execute *KAMAL.healthcheck.remove, raise_on_non_zero_exit: false
end
end
end
end

View File

@@ -0,0 +1,31 @@
class Kamal::Cli::Healthcheck::Barrier
def initialize
@ivar = Concurrent::IVar.new
end
def close
set(false)
end
def open
set(true)
end
def wait
unless opened?
raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier")
end
end
private
def opened?
@ivar.value
end
def set(value)
@ivar.set(value)
true
rescue Concurrent::MultipleAssignmentError
false
end
end

View File

@@ -0,0 +1,2 @@
class Kamal::Cli::Healthcheck::Error < StandardError
end

View File

@@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller
TRAEFIK_UPDATE_DELAY = 5
class HealthcheckError < StandardError; end
def wait_for_healthy(pause_after_ready: false, &block)
attempt = 1
@@ -16,9 +15,9 @@ module Kamal::Cli::Healthcheck::Poller
when "running" # No health check configured
sleep KAMAL.config.readiness_delay if pause_after_ready
else
raise HealthcheckError, "container not ready (#{status})"
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
end
rescue HealthcheckError => e
rescue Kamal::Cli::Healthcheck::Error => e
if attempt <= max_attempts
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt
@@ -41,9 +40,9 @@ module Kamal::Cli::Healthcheck::Poller
when "unhealthy"
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
else
raise HealthcheckError, "container not unhealthy (#{status})"
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
end
rescue HealthcheckError => e
rescue Kamal::Cli::Healthcheck::Error => e
if attempt <= max_attempts
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt

View File

@@ -41,11 +41,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
say "Ensure Traefik is running...", :magenta
invoke "kamal:cli:traefik:boot", [], invoke_options
if KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
say "Ensure app can pass healthcheck...", :magenta
invoke "kamal:cli:healthcheck:perform", [], invoke_options
end
say "Detect stale containers...", :magenta
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
@@ -76,9 +71,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
run_hook "pre-deploy"
say "Ensure app can pass healthcheck...", :magenta
invoke "kamal:cli:healthcheck:perform", [], invoke_options
say "Detect stale containers...", :magenta
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
@@ -227,9 +219,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
desc "env", "Manage environment files"
subcommand "env", Kamal::Cli::Env
desc "healthcheck", "Healthcheck application"
subcommand "healthcheck", Kamal::Cli::Healthcheck
desc "lock", "Manage the deploy lock"
subcommand "lock", Kamal::Cli::Lock
@@ -254,7 +243,7 @@ class Kamal::Cli::Main < Kamal::Cli::Base
raise "Container not found" unless container_id.present?
end
end
rescue SSHKit::Runner::ExecuteError => e
rescue SSHKit::Runner::ExecuteError, SSHKit::Runner::MultipleExecuteError => e
if e.message =~ /Container not found/
say "Error looking for container version #{version}: #{e.message}"
return false