Move health checks into Docker
Replaces our current host-based HTTP healthchecks with Docker healthchecks, and adds a new `healthcheck.cmd` config option that can be used to define a custom health check command. Also removes Traefik's healthchecks, since they are no longer necessary. When deploying a container that has a healthcheck defined, we wait for it to report a healthy status before stopping the old container that it replaces. Containers that don't have a healthcheck defined continue to wait for `MRSK.config.readiness_delay`. There are some pros and cons to using Docker healthchecks rather than checking from the host. The main advantages are: - Supports non-HTTP checks, and app-specific check scripts provided by a container. - When booting a container, allows MRSK to wait for a container to be healthy before shutting down the old container it replaces. This should be safer than relying on a timeout. - Containers with healthchecks won't be active in Traefik until they reach a healthy state, which prevents any traffic from being routed to them before they are ready. The main _disadvantage_ is that containers are now required to provide some way to check their health. Our default check assumes that `curl` is available in the container which, while common, won't always be the case.
This commit is contained in:
@@ -6,8 +6,6 @@ class Mrsk::Cli::App < Mrsk::Cli::Base
|
||||
using_version(version_or_latest) do |version|
|
||||
say "Start container with version #{version} using a #{MRSK.config.readiness_delay}s readiness delay (or reboot if already running)...", :magenta
|
||||
|
||||
cli = self
|
||||
|
||||
on(MRSK.hosts) do
|
||||
execute *MRSK.auditor.record("Tagging #{MRSK.config.absolute_image} as the latest image"), verbosity: :debug
|
||||
execute *MRSK.app.tag_current_as_latest
|
||||
@@ -17,19 +15,24 @@ class Mrsk::Cli::App < Mrsk::Cli::Base
|
||||
roles = MRSK.roles_on(host)
|
||||
|
||||
roles.each do |role|
|
||||
execute *MRSK.auditor(role: role).record("Booted app version #{version}"), verbosity: :debug
|
||||
app = MRSK.app(role: role)
|
||||
auditor = MRSK.auditor(role: role)
|
||||
|
||||
if capture_with_info(*MRSK.app(role: role).container_id_for_version(version), raise_on_non_zero_exit: false).present?
|
||||
execute *auditor.record("Booted app version #{version}"), verbosity: :debug
|
||||
|
||||
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
|
||||
tmp_version = "#{version}_#{SecureRandom.hex(8)}"
|
||||
info "Renaming container #{version} to #{tmp_version} as already deployed on #{host}"
|
||||
execute *MRSK.auditor(role: role).record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
|
||||
execute *MRSK.app(role: role).rename_container(version: version, new_version: tmp_version)
|
||||
execute *auditor.record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
|
||||
execute *app.rename_container(version: version, new_version: tmp_version)
|
||||
end
|
||||
|
||||
old_version = capture_with_info(*MRSK.app(role: role).current_running_version, raise_on_non_zero_exit: false).strip
|
||||
execute *MRSK.app(role: role).run
|
||||
sleep MRSK.config.readiness_delay
|
||||
execute *MRSK.app(role: role).stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
|
||||
old_version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
|
||||
execute *app.run
|
||||
|
||||
Mrsk::Utils::HealthcheckPoller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
|
||||
|
||||
execute *app.stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
class Mrsk::Cli::Healthcheck < Mrsk::Cli::Base
|
||||
|
||||
class HealthcheckError < StandardError; end
|
||||
|
||||
default_command :perform
|
||||
|
||||
desc "perform", "Health check current app version"
|
||||
@@ -9,38 +6,10 @@ class Mrsk::Cli::Healthcheck < Mrsk::Cli::Base
|
||||
on(MRSK.primary_host) do
|
||||
begin
|
||||
execute *MRSK.healthcheck.run
|
||||
|
||||
target = "Health check against #{MRSK.config.healthcheck["path"]}"
|
||||
attempt = 1
|
||||
max_attempts = MRSK.config.healthcheck["max_attempts"]
|
||||
|
||||
begin
|
||||
status = capture_with_info(*MRSK.healthcheck.curl)
|
||||
|
||||
if status == "200"
|
||||
info "#{target} succeeded with 200 OK!"
|
||||
else
|
||||
raise HealthcheckError, "#{target} failed with status #{status}"
|
||||
end
|
||||
rescue SSHKit::Command::Failed
|
||||
if attempt <= max_attempts
|
||||
info "#{target} failed to respond, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
|
||||
sleep attempt
|
||||
attempt += 1
|
||||
|
||||
retry
|
||||
else
|
||||
raise
|
||||
end
|
||||
end
|
||||
rescue SSHKit::Command::Failed, HealthcheckError => e
|
||||
Mrsk::Utils::HealthcheckPoller.wait_for_healthy { capture_with_info(*MRSK.healthcheck.status) }
|
||||
rescue Mrsk::Utils::HealthcheckPoller::HealthcheckError => e
|
||||
error capture_with_info(*MRSK.healthcheck.logs)
|
||||
|
||||
if e.message =~ /curl/
|
||||
raise SSHKit::Command::Failed, "#{target} failed to return 200 OK!"
|
||||
else
|
||||
raise
|
||||
end
|
||||
raise
|
||||
ensure
|
||||
execute *MRSK.healthcheck.stop, raise_on_non_zero_exit: false
|
||||
execute *MRSK.healthcheck.remove, raise_on_non_zero_exit: false
|
||||
|
||||
@@ -15,6 +15,7 @@ class Mrsk::Commands::App < Mrsk::Commands::Base
|
||||
"--name", container_name,
|
||||
"-e", "MRSK_CONTAINER_NAME=\"#{container_name}\"",
|
||||
*role.env_args,
|
||||
*role.health_check_args,
|
||||
*config.logging_args,
|
||||
*config.volume_args,
|
||||
*role.label_args,
|
||||
@@ -27,6 +28,10 @@ class Mrsk::Commands::App < Mrsk::Commands::Base
|
||||
docker :start, container_name
|
||||
end
|
||||
|
||||
def status(version:)
|
||||
pipe container_id_for_version(version), xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT))
|
||||
end
|
||||
|
||||
def stop(version: nil)
|
||||
pipe \
|
||||
version ? container_id_for_version(version) : current_running_container_id,
|
||||
|
||||
@@ -2,6 +2,8 @@ module Mrsk::Commands
|
||||
class Base
|
||||
delegate :sensitive, :argumentize, to: Mrsk::Utils
|
||||
|
||||
DOCKER_HEALTH_STATUS_FORMAT = "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'"
|
||||
|
||||
attr_accessor :config
|
||||
|
||||
def initialize(config)
|
||||
|
||||
@@ -11,14 +11,15 @@ class Mrsk::Commands::Healthcheck < Mrsk::Commands::Base
|
||||
"--label", "service=#{container_name}",
|
||||
"-e", "MRSK_CONTAINER_NAME=\"#{container_name}\"",
|
||||
*web.env_args,
|
||||
*web.health_check_args,
|
||||
*config.volume_args,
|
||||
*web.option_args,
|
||||
config.absolute_image,
|
||||
web.cmd
|
||||
end
|
||||
|
||||
def curl
|
||||
[ :curl, "--silent", "--output", "/dev/null", "--write-out", "'%{http_code}'", "--max-time", "2", health_url ]
|
||||
def status
|
||||
pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT))
|
||||
end
|
||||
|
||||
def logs
|
||||
|
||||
@@ -35,6 +35,21 @@ class Mrsk::Configuration::Role
|
||||
argumentize_env_with_secrets env
|
||||
end
|
||||
|
||||
def health_check_args
|
||||
if health_check_cmd.present?
|
||||
optionize({ "health-cmd" => health_check_cmd, "health-interval" => "1s" })
|
||||
else
|
||||
[]
|
||||
end
|
||||
end
|
||||
|
||||
def health_check_cmd
|
||||
options = specializations["healthcheck"] || {}
|
||||
options = config.healthcheck.merge(options) if running_traefik?
|
||||
|
||||
options["cmd"] || http_health_check(port: options["port"], path: options["path"])
|
||||
end
|
||||
|
||||
def cmd
|
||||
specializations["cmd"]
|
||||
end
|
||||
@@ -75,8 +90,6 @@ class Mrsk::Configuration::Role
|
||||
if running_traefik?
|
||||
{
|
||||
"traefik.http.routers.#{traefik_service}.rule" => "PathPrefix(`/`)",
|
||||
"traefik.http.services.#{traefik_service}.loadbalancer.healthcheck.path" => config.healthcheck["path"],
|
||||
"traefik.http.services.#{traefik_service}.loadbalancer.healthcheck.interval" => "1s",
|
||||
"traefik.http.middlewares.#{traefik_service}-retry.retry.attempts" => "5",
|
||||
"traefik.http.middlewares.#{traefik_service}-retry.retry.initialinterval" => "500ms",
|
||||
"traefik.http.routers.#{traefik_service}.middlewares" => "#{traefik_service}-retry@docker"
|
||||
@@ -125,4 +138,8 @@ class Mrsk::Configuration::Role
|
||||
new_env["clear"] = (clear_app_env + clear_role_env).uniq
|
||||
end
|
||||
end
|
||||
|
||||
def http_health_check(port:, path:)
|
||||
"curl -f #{URI.join("http://localhost:#{port}", path)} || exit 1" if path.present? || port.present?
|
||||
end
|
||||
end
|
||||
|
||||
39
lib/mrsk/utils/healthcheck_poller.rb
Normal file
39
lib/mrsk/utils/healthcheck_poller.rb
Normal file
@@ -0,0 +1,39 @@
|
||||
class Mrsk::Utils::HealthcheckPoller
|
||||
TRAEFIK_HEALTHY_DELAY = 1
|
||||
|
||||
class HealthcheckError < StandardError; end
|
||||
|
||||
class << self
|
||||
def wait_for_healthy(pause_after_ready: false, &block)
|
||||
attempt = 1
|
||||
max_attempts = MRSK.config.healthcheck["max_attempts"]
|
||||
|
||||
begin
|
||||
case status = block.call
|
||||
when "healthy"
|
||||
sleep TRAEFIK_HEALTHY_DELAY if pause_after_ready
|
||||
when "running" # No health check configured
|
||||
sleep MRSK.config.readiness_delay if pause_after_ready
|
||||
else
|
||||
raise HealthcheckError, "container not ready (#{status})"
|
||||
end
|
||||
rescue HealthcheckError => e
|
||||
if attempt <= max_attempts
|
||||
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
|
||||
sleep attempt
|
||||
attempt += 1
|
||||
retry
|
||||
else
|
||||
raise
|
||||
end
|
||||
end
|
||||
|
||||
info "Container is healthy!"
|
||||
end
|
||||
|
||||
private
|
||||
def info(message)
|
||||
SSHKit.config.output.info(message)
|
||||
end
|
||||
end
|
||||
end
|
||||
Reference in New Issue
Block a user