Merge pull request #740 from basecamp/remove-healthcheck-step
Remove the healthcheck step
This commit is contained in:
@@ -14,12 +14,16 @@ class Kamal::Cli::App < Kamal::Cli::Base
|
||||
end
|
||||
end
|
||||
|
||||
# Primary hosts and roles are returned first, so they can open the barrier
|
||||
barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many?
|
||||
|
||||
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
|
||||
KAMAL.roles_on(host).each do |role|
|
||||
Kamal::Cli::App::Boot.new(host, role, version, self).run
|
||||
Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run
|
||||
end
|
||||
end
|
||||
|
||||
# Tag once the app booted on all hosts
|
||||
on(KAMAL.hosts) do |host|
|
||||
execute *KAMAL.auditor.record("Tagging #{KAMAL.config.absolute_image} as the latest image"), verbosity: :debug
|
||||
execute *KAMAL.app.tag_latest_image
|
||||
|
||||
@@ -1,19 +1,30 @@
|
||||
class Kamal::Cli::App::Boot
|
||||
attr_reader :host, :role, :version, :sshkit
|
||||
delegate :execute, :capture_with_info, :info, to: :sshkit
|
||||
delegate :uses_cord?, :assets?, to: :role
|
||||
attr_reader :host, :role, :version, :barrier, :sshkit
|
||||
delegate :execute, :capture_with_info, :capture_with_pretty_json, :info, :error, to: :sshkit
|
||||
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
|
||||
|
||||
def initialize(host, role, version, sshkit)
|
||||
def initialize(host, role, sshkit, version, barrier)
|
||||
@host = host
|
||||
@role = role
|
||||
@version = version
|
||||
@barrier = barrier
|
||||
@sshkit = sshkit
|
||||
end
|
||||
|
||||
def run
|
||||
old_version = old_version_renamed_if_clashing
|
||||
|
||||
start_new_version
|
||||
wait_at_barrier if queuer?
|
||||
|
||||
begin
|
||||
start_new_version
|
||||
rescue => e
|
||||
close_barrier if gatekeeper?
|
||||
stop_new_version
|
||||
raise
|
||||
end
|
||||
|
||||
release_barrier if gatekeeper?
|
||||
|
||||
if old_version
|
||||
stop_old_version(old_version)
|
||||
@@ -21,18 +32,6 @@ class Kamal::Cli::App::Boot
|
||||
end
|
||||
|
||||
private
|
||||
# Returns the object from `KAMAL.app` for this boot's role and host.
# Built on the first call and reused on later calls.
def app
  return @app if @app

  @app = KAMAL.app(role: role, host: host)
end
|
||||
|
||||
# Returns the auditor from `KAMAL.auditor` for this boot's role.
# Memoized with `||=` so repeated `audit` calls reuse one instance,
# the same way `app` caches its value.
def auditor
  @auditor ||= KAMAL.auditor(role: role)
end
|
||||
|
||||
# Runs the command produced by `auditor.record(message)` at debug verbosity.
def audit(message)
  record_command = auditor.record(message)
  execute(*record_command, verbosity: :debug)
end
|
||||
|
||||
def old_version_renamed_if_clashing
|
||||
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
|
||||
renamed_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
|
||||
@@ -46,12 +45,17 @@ class Kamal::Cli::App::Boot
|
||||
|
||||
# Starts the container for `version` and polls until it reports ready.
#
# Steps, in order: write an audit record, tie the cord when the role uses one,
# run the app under a generated hostname, then poll the app status through
# Kamal::Cli::Healthcheck::Poller.
def start_new_version
  audit "Booted app version #{version}"

  if uses_cord?
    execute(*app.tie_cord(role.cord_host_file))
  end

  # Hostname = first 51 characters of the host (trailing dots stripped) plus a random hex suffix.
  host_prefix = host.to_s[0...51].gsub(/\.+$/, '')
  hostname = "#{host_prefix}-#{SecureRandom.hex(6)}"
  execute(*app.run(hostname: hostname))

  Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) do
    capture_with_info(*app.status(version: version))
  end
end
|
||||
|
||||
# Stops the container for `version`; a non-zero exit from the stop command is tolerated.
def stop_new_version
  stop_command = app.stop(version: version)
  execute(*stop_command, raise_on_non_zero_exit: false)
end
|
||||
|
||||
def stop_old_version(version)
|
||||
if uses_cord?
|
||||
cord = capture_with_info(*app.cord(version: version), raise_on_non_zero_exit: false).strip
|
||||
@@ -65,4 +69,51 @@ class Kamal::Cli::App::Boot
|
||||
|
||||
execute *app.clean_up_assets if assets?
|
||||
end
|
||||
|
||||
# Opens the barrier and, only when this call was the one that opened it,
# logs that the other roles may now boot.
def release_barrier
  return unless barrier.open

  info "First #{KAMAL.primary_role} container is healthy on #{host}, booting other roles"
end
|
||||
|
||||
# Waits on the barrier before booting this role, logging before and after.
#
# When `barrier.wait` raises Kamal::Cli::Healthcheck::Error, logs that the
# role will not be booted and re-raises the same error.
def wait_at_barrier
  info("Waiting for the first healthy #{KAMAL.primary_role} container before booting #{role} on #{host}...")
  barrier.wait
  info("First #{KAMAL.primary_role} container is healthy, booting #{role} on #{host}...")
rescue Kamal::Cli::Healthcheck::Error
  info("First #{KAMAL.primary_role} container is unhealthy, not booting #{role} on #{host}")
  raise
end
|
||||
|
||||
# Closes the barrier and, only when this call was the one that closed it,
# logs the failure and emits the container's logs and health log as errors.
#
# NOTE(review): if either capture raises, that error would propagate from the
# caller's rescue path in place of the original boot failure - confirm this is acceptable.
def close_barrier
  return unless barrier.close

  info "First #{KAMAL.primary_role} container is unhealthy on #{host}, not booting other roles"

  container_logs = capture_with_info(*app.logs(version: version))
  error container_logs

  health_log = capture_with_info(*app.container_health_log(version: version))
  error health_log
end
|
||||
|
||||
# True when this boot's role is the primary role reported by KAMAL.
def barrier_role?
  primary = KAMAL.primary_role
  role == primary
end
|
||||
|
||||
# Returns the object from `KAMAL.app` for this boot's role and host.
# Built on the first call and reused on later calls.
def app
  return @app if @app

  @app = KAMAL.app(role: role, host: host)
end
|
||||
|
||||
# Returns the auditor from `KAMAL.auditor` for this boot's role.
# Memoized with `||=` so repeated `audit` calls reuse one instance,
# the same way `app` caches its value.
def auditor
  @auditor ||= KAMAL.auditor(role: role)
end
|
||||
|
||||
# Runs the command produced by `auditor.record(message)` at debug verbosity.
def audit(message)
  record_command = auditor.record(message)
  execute(*record_command, verbosity: :debug)
end
|
||||
|
||||
# Truthy when a barrier exists and this boot is for the barrier (primary) role,
# i.e. this boot is the one that opens or closes the barrier.
def gatekeeper?
  gate = barrier
  gate && barrier_role?
end
|
||||
|
||||
# Truthy when a barrier exists and this boot is NOT for the barrier (primary) role,
# i.e. this boot has to wait at the barrier.
def queuer?
  gate = barrier
  gate && !barrier_role?
end
|
||||
end
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
# CLI command group whose default command runs a one-off health check
# container on the primary host.
class Kamal::Cli::Healthcheck < Kamal::Cli::Base
  default_command :perform

  desc "perform", "Health check current app version"
  # Starts the health check container on the primary host and polls it until healthy.
  #
  # Raises a RuntimeError up front when the primary role is not running Traefik.
  # On Poller::HealthcheckError the container logs and health log are emitted as
  # errors before the error is re-raised. The container is always stopped and
  # removed afterwards; failures of those two cleanup commands are tolerated.
  def perform
    primary_role = KAMAL.config.role(KAMAL.config.primary_role)
    unless primary_role.running_traefik?
      raise "The primary host is not configured to run Traefik"
    end

    on(KAMAL.primary_host) do
      execute(*KAMAL.healthcheck.run)
      Poller.wait_for_healthy { capture_with_info(*KAMAL.healthcheck.status) }
    rescue Poller::HealthcheckError
      error capture_with_info(*KAMAL.healthcheck.logs)
      error capture_with_pretty_json(*KAMAL.healthcheck.container_health_log)
      raise
    ensure
      execute(*KAMAL.healthcheck.stop, raise_on_non_zero_exit: false)
      execute(*KAMAL.healthcheck.remove, raise_on_non_zero_exit: false)
    end
  end
end
|
||||
31
lib/kamal/cli/healthcheck/barrier.rb
Normal file
31
lib/kamal/cli/healthcheck/barrier.rb
Normal file
@@ -0,0 +1,31 @@
|
||||
# A one-shot gate backed by a Concurrent::IVar.
#
# The first call to #open or #close decides the outcome; later calls leave it
# unchanged and return false. #wait reads the decided value and raises when
# the gate was closed.
class Kamal::Cli::Healthcheck::Barrier
  def initialize
    @ivar = Concurrent::IVar.new
  end

  # Marks the barrier as closed. Returns true if this call set the value,
  # false if it had already been set.
  def close
    set(false)
  end

  # Marks the barrier as open. Returns true if this call set the value,
  # false if it had already been set.
  def open
    set(true)
  end

  # Returns nil when the barrier was opened; raises
  # Kamal::Cli::Healthcheck::Error when it was not.
  def wait
    return if opened?

    raise Kamal::Cli::Healthcheck::Error, "Halted at barrier"
  end

  private
    # Reads the IVar's value (per concurrent-ruby, this blocks until a value has been set).
    def opened?
      @ivar.value
    end

    # Attempts to set the IVar; a second assignment is reported as false rather than raised.
    def set(value)
      @ivar.set(value)
    rescue Concurrent::MultipleAssignmentError
      false
    else
      true
    end
end
|
||||
2
lib/kamal/cli/healthcheck/error.rb
Normal file
2
lib/kamal/cli/healthcheck/error.rb
Normal file
@@ -0,0 +1,2 @@
|
||||
# Error raised for failed health checks and for boots halted at the barrier.
class Kamal::Cli::Healthcheck::Error < StandardError; end
|
||||
@@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller
|
||||
|
||||
TRAEFIK_UPDATE_DELAY = 5
|
||||
|
||||
class HealthcheckError < StandardError; end
|
||||
|
||||
def wait_for_healthy(pause_after_ready: false, &block)
|
||||
attempt = 1
|
||||
@@ -16,9 +15,9 @@ module Kamal::Cli::Healthcheck::Poller
|
||||
when "running" # No health check configured
|
||||
sleep KAMAL.config.readiness_delay if pause_after_ready
|
||||
else
|
||||
raise HealthcheckError, "container not ready (#{status})"
|
||||
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
|
||||
end
|
||||
rescue HealthcheckError => e
|
||||
rescue Kamal::Cli::Healthcheck::Error => e
|
||||
if attempt <= max_attempts
|
||||
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
|
||||
sleep attempt
|
||||
@@ -41,9 +40,9 @@ module Kamal::Cli::Healthcheck::Poller
|
||||
when "unhealthy"
|
||||
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
|
||||
else
|
||||
raise HealthcheckError, "container not unhealthy (#{status})"
|
||||
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
|
||||
end
|
||||
rescue HealthcheckError => e
|
||||
rescue Kamal::Cli::Healthcheck::Error => e
|
||||
if attempt <= max_attempts
|
||||
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
|
||||
sleep attempt
|
||||
|
||||
@@ -41,11 +41,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
|
||||
say "Ensure Traefik is running...", :magenta
|
||||
invoke "kamal:cli:traefik:boot", [], invoke_options
|
||||
|
||||
if KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
|
||||
say "Ensure app can pass healthcheck...", :magenta
|
||||
invoke "kamal:cli:healthcheck:perform", [], invoke_options
|
||||
end
|
||||
|
||||
say "Detect stale containers...", :magenta
|
||||
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
|
||||
|
||||
@@ -76,9 +71,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
|
||||
|
||||
run_hook "pre-deploy"
|
||||
|
||||
say "Ensure app can pass healthcheck...", :magenta
|
||||
invoke "kamal:cli:healthcheck:perform", [], invoke_options
|
||||
|
||||
say "Detect stale containers...", :magenta
|
||||
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
|
||||
|
||||
@@ -227,9 +219,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base
|
||||
desc "env", "Manage environment files"
|
||||
subcommand "env", Kamal::Cli::Env
|
||||
|
||||
desc "healthcheck", "Healthcheck application"
|
||||
subcommand "healthcheck", Kamal::Cli::Healthcheck
|
||||
|
||||
desc "lock", "Manage the deploy lock"
|
||||
subcommand "lock", Kamal::Cli::Lock
|
||||
|
||||
@@ -254,7 +243,7 @@ class Kamal::Cli::Main < Kamal::Cli::Base
|
||||
raise "Container not found" unless container_id.present?
|
||||
end
|
||||
end
|
||||
rescue SSHKit::Runner::ExecuteError => e
|
||||
rescue SSHKit::Runner::ExecuteError, SSHKit::Runner::MultipleExecuteError => e
|
||||
if e.message =~ /Container not found/
|
||||
say "Error looking for container version #{version}: #{e.message}"
|
||||
return false
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
module Kamal::Commands::App::Containers
|
||||
DOCKER_HEALTH_LOG_FORMAT = "'{{json .State.Health}}'"
|
||||
|
||||
# Builds `docker container ls --all` restricted by this app's filter args.
def list_containers
  docker(:container, :ls, "--all", *filter_args)
end
|
||||
@@ -20,4 +22,10 @@ module Kamal::Commands::App::Containers
|
||||
# Builds `docker container prune --force` restricted by this app's filter args.
def remove_containers
  docker(:container, :prune, "--force", *filter_args)
end
|
||||
|
||||
# Builds a pipeline that looks up the container id for `version` and feeds it,
# via xargs, to `docker inspect` formatted with DOCKER_HEALTH_LOG_FORMAT.
def container_health_log(version:)
  find_container = container_id_for(container_name: container_name(version))
  inspect_health = xargs(docker(:inspect, "--format", DOCKER_HEALTH_LOG_FORMAT))

  pipe(find_container, inspect_health)
end
|
||||
end
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
module Kamal::Commands::App::Logging
|
||||
def logs(since: nil, lines: nil, grep: nil)
|
||||
def logs(version: nil, since: nil, lines: nil, grep: nil)
|
||||
pipe \
|
||||
current_running_container_id,
|
||||
version ? container_id_for_version(version) : current_running_container_id,
|
||||
"xargs docker logs#{" --since #{since}" if since}#{" --tail #{lines}" if lines} 2>&1",
|
||||
("grep '#{grep}'" if grep)
|
||||
end
|
||||
|
||||
@@ -3,7 +3,6 @@ module Kamal::Commands
|
||||
delegate :sensitive, :argumentize, to: Kamal::Utils
|
||||
|
||||
DOCKER_HEALTH_STATUS_FORMAT = "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'"
|
||||
DOCKER_HEALTH_LOG_FORMAT = "'{{json .State.Health}}'"
|
||||
|
||||
attr_accessor :config
|
||||
|
||||
|
||||
@@ -1,59 +0,0 @@
|
||||
# Builds the docker commands for a standalone health check container, named
# "<healthcheck_service>-<version>" and configured from the primary role.
class Kamal::Commands::Healthcheck < Kamal::Commands::Base
  # `docker run --detach ...` for the health check container, publishing the
  # configured healthcheck port on the exposed port and reusing the primary
  # role's env, health check, option args and command.
  def run
    primary = config.role(config.primary_role)

    run_args = [
      "--detach",
      "--name", container_name_with_version,
      "--publish", "#{exposed_port}:#{config.healthcheck["port"]}",
      "--label", "service=#{config.healthcheck_service}",
      "-e", "KAMAL_CONTAINER_NAME=\"#{config.healthcheck_service}\"",
      *primary.env_args(config.primary_host),
      *primary.health_check_args(cord: false),
      *config.volume_args,
      *primary.option_args,
      config.absolute_image,
      primary.cmd
    ]

    docker(:run, *run_args)
  end

  # Pipeline: container id -> `docker inspect` with the health status format.
  def status
    pipe(container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT)))
  end

  # Pipeline: container id -> `docker inspect` with the health log format.
  def container_health_log
    pipe(container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_LOG_FORMAT)))
  end

  # Pipeline: container id -> `docker logs --tail <log_lines>` with stderr merged into stdout.
  def logs
    pipe(container_id, xargs(docker(:logs, "--tail", log_lines, "2>&1")))
  end

  # Pipeline: container id -> `docker stop`.
  def stop
    pipe(container_id, xargs(docker(:stop)))
  end

  # Pipeline: container id -> `docker container rm`.
  def remove
    pipe(container_id, xargs(docker(:container, :rm)))
  end

  private
    # Container name: the healthcheck service name joined to the configured version.
    def container_name_with_version
      [ config.healthcheck_service, config.version ].join("-")
    end

    # Command that resolves the id of the health check container by its name.
    def container_id
      container_id_for(container_name: container_name_with_version)
    end

    # Local URL of the health check endpoint on the exposed port.
    def health_url
      path = config.healthcheck["path"]
      "http://localhost:#{exposed_port}#{path}"
    end

    # Port read from the "exposed_port" healthcheck setting.
    def exposed_port
      config.healthcheck["exposed_port"]
    end

    # Number of log lines read from the "log_lines" healthcheck setting.
    def log_lines
      config.healthcheck["log_lines"]
    end
end
|
||||
@@ -188,7 +188,7 @@ class Kamal::Configuration
|
||||
|
||||
|
||||
def healthcheck
|
||||
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
|
||||
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
|
||||
end
|
||||
|
||||
def healthcheck_service
|
||||
|
||||
@@ -103,3 +103,39 @@ class SSHKit::Backend::Netssh
|
||||
|
||||
prepend LimitConcurrentStartsInstance
|
||||
end
|
||||
|
||||
class SSHKit::Runner::Parallel
  # SSHKit joins the threads in sequence and fails on the first error it encounters, which means that we wait
  # for the threads before the first failure to complete, but not for the ones after it.
  #
  # We patch it to wait for all of them to complete, and to record every thread that errored so we can see
  # when a problem occurs on multiple hosts.
  module CompleteAll
    # Runs the backend block on every host in its own thread and joins them all.
    #
    # Any StandardError inside a thread is wrapped in SSHKit::Runner::ExecuteError
    # with a message naming the host. After all threads have been joined, a single
    # failure is re-raised as is; several failures are raised as the first error
    # with a message listing every host's failure.
    def execute
      workers = hosts.map do |host|
        Thread.new(host) do |target|
          backend(target, &block).run
        rescue ::StandardError => cause
          wrapped = SSHKit::Runner::ExecuteError.new(cause)
          raise wrapped, "Exception while executing #{host.user ? "as #{host.user}@" : "on host "}#{host}: #{cause.message}"
        end
      end

      # Joining re-raises a thread's error here; collect instead of stopping at the first one.
      failures = workers.each_with_object([]) do |worker, collected|
        worker.join
      rescue SSHKit::Runner::ExecuteError => failure
        collected << failure
      end

      if failures.one?
        raise failures.first
      elsif failures.many?
        summary = [ "Exceptions on #{failures.count} hosts:", failures.map(&:message) ].join("\n")
        raise failures.first, summary
      end
    end
  end

  prepend CompleteAll
end
|
||||
|
||||
Reference in New Issue
Block a user