Zero downtime deployment with cord file

When replacing a container currently we:
1. Boot the new container
2. Wait for it to become healthy
3. Stop the old container

Traefik will send requests to the old container until it notices that it
is unhealthy. But it may have stopped serving requests before that point
which can result in errors.

To get round that the new boot process is:

1. Create a directory with a single file on the host
2. Boot the new container, mounting the cord file into /tmp and
including a check for the file in the docker healthcheck
3. Wait for it to become healthy
4. Delete the healthcheck file ("cut the cord") for the old container
5. Wait for it to become unhealthy and give Traefik a couple of seconds
to notice
6. Stop the old container

The extra steps ensure that Traefik stops sending requests before the
old container is shutdown.
This commit is contained in:
Donal McBreen
2023-08-31 10:21:57 +01:00
parent 94bf090657
commit 8a41d15b69
17 changed files with 234 additions and 65 deletions

View File

@@ -18,8 +18,9 @@ class Kamal::Cli::App < Kamal::Cli::Base
roles.each do |role|
app = KAMAL.app(role: role)
auditor = KAMAL.auditor(role: role)
role_config = KAMAL.config.role(role)
if capture_with_info(*app.container_id_for_version(version, only_running: true), raise_on_non_zero_exit: false).present?
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
tmp_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
info "Renaming container #{version} to #{tmp_version} as already deployed on #{host}"
execute *auditor.record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
@@ -29,11 +30,25 @@ class Kamal::Cli::App < Kamal::Cli::Base
execute *auditor.record("Booted app version #{version}"), verbosity: :debug
old_version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
execute *app.start_or_run(hostname: "#{host}-#{SecureRandom.hex(6)}")
if role_config.uses_cord?
execute *app.tie_cord(role_config.cord_host_file)
end
execute *app.run(hostname: "#{host}-#{SecureRandom.hex(6)}")
Kamal::Utils::HealthcheckPoller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
execute *app.stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
if old_version.present?
if role_config.uses_cord?
cord = capture_with_info(*app.cord(version: old_version), raise_on_non_zero_exit: false).strip
if cord.present?
execute *app.cut_cord(cord)
Kamal::Utils::HealthcheckPoller.wait_for_unhealthy(pause_after_ready: true) { capture_with_info(*app.status(version: old_version)) }
end
end
execute *app.stop(version: old_version), raise_on_non_zero_exit: false
end
end
end
end

View File

@@ -1,34 +1,29 @@
class Kamal::Commands::App < Kamal::Commands::Base
ACTIVE_DOCKER_STATUSES = [ :running, :restarting ]
attr_reader :role
attr_reader :role, :role_config
def initialize(config, role: nil)
super(config)
@role = role
end
def start_or_run(hostname: nil)
combine start, run(hostname: hostname), by: "||"
@role_config = config.role(self.role)
end
def run(hostname: nil)
role = config.role(self.role)
docker :run,
"--detach",
"--restart unless-stopped",
"--name", container_name,
*(["--hostname", hostname] if hostname),
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
*role.env_args,
*role.health_check_args,
*role_config.env_args,
*role_config.health_check_args,
*config.logging_args,
*config.volume_args,
*role.label_args,
*role.option_args,
*role_config.label_args,
*role_config.option_args,
config.absolute_image,
role.cmd
role_config.cmd
end
def start
@@ -76,14 +71,12 @@ class Kamal::Commands::App < Kamal::Commands::Base
end
def execute_in_new_container(*command, interactive: false)
role = config.role(self.role)
docker :run,
("-it" if interactive),
"--rm",
*role&.env_args,
*role_config&.env_args,
*config.volume_args,
*role&.option_args,
*role_config&.option_args,
config.absolute_image,
*command
end
@@ -112,7 +105,7 @@ class Kamal::Commands::App < Kamal::Commands::Base
def list_versions(*docker_args, statuses: nil)
pipe \
docker(:ps, *filter_args(statuses: statuses), *docker_args, "--format", '"{{.Names}}"'),
%(while read line; do echo ${line##{service_role_dest}-}; done) # Extract SHA from "service-role-dest-SHA"
%(while read line; do echo ${line##{role_config.full_name}-}; done) # Extract SHA from "service-role-dest-SHA"
end
def list_containers
@@ -150,16 +143,30 @@ class Kamal::Commands::App < Kamal::Commands::Base
end
def make_env_directory
make_directory config.role(role).host_env_directory
make_directory role_config.host_env_directory
end
def remove_env_file
[:rm, "-f", config.role(role).host_env_file_path]
[:rm, "-f", role_config.host_env_file_path]
end
def cord(version:)
pipe \
docker(:inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", container_name(version)),
[:awk, "'$2 == \"#{role_config.cord_container_directory}\" {print $1}'"]
end
def tie_cord(cord)
create_empty_file(cord)
end
def cut_cord(cord)
remove_directory(cord)
end
private
def container_name(version = nil)
[ config.service, role, config.destination, version || config.version ].compact.join("-")
[ role_config.full_name, version || config.version ].compact.join("-")
end
def filter_args(statuses: nil)

View File

@@ -34,6 +34,10 @@ module Kamal::Commands
[ :mkdir, "-p", path ]
end
def remove_directory(path)
[ :rm, "-r", path ]
end
private
def combine(*commands, by: "&&")
commands
@@ -69,5 +73,11 @@ module Kamal::Commands
def tags(**details)
Kamal::Tags.from_config(config, **details)
end
def create_empty_file(file)
chain \
make_directory_for(file),
[:touch, file]
end
end
end

View File

@@ -10,7 +10,7 @@ class Kamal::Commands::Healthcheck < Kamal::Commands::Base
"--label", "service=#{container_name}",
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
*web.env_args,
*web.health_check_args,
*web.health_check_args(cord: false),
*config.volume_args,
*web.option_args,
config.absolute_image,

View File

@@ -61,6 +61,14 @@ class Kamal::Configuration
raw_config.run_directory || ".kamal"
end
def run_directory_as_docker_volume
if Pathname.new(run_directory).absolute?
run_directory
else
File.join "$(pwd)", run_directory
end
end
def roles
@roles ||= role_names.collect { |role_name| Role.new(role_name, config: self) }
@@ -141,7 +149,7 @@ class Kamal::Configuration
def healthcheck
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999 }.merge(raw_config.healthcheck || {})
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord" }.merge(raw_config.healthcheck || {})
end
def readiness_delay
@@ -199,6 +207,10 @@ class Kamal::Configuration
"#{run_directory}/env"
end
def run_id
@run_id ||= SecureRandom.hex(16)
end
private
# Will raise ArgumentError if any required config keys are missing
def ensure_required_keys_present

View File

@@ -1,4 +1,5 @@
class Kamal::Configuration::Role
CORD_FILE = "cord"
delegate :argumentize, :env_file_with_secrets, :optionize, to: Kamal::Utils
attr_accessor :name
@@ -47,28 +48,52 @@ class Kamal::Configuration::Role
argumentize "--env-file", host_env_file_path
end
def health_check_args
def health_check_args(cord: true)
if health_check_cmd.present?
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
if cord && uses_cord?
optionize({ "health-cmd" => health_check_cmd_with_cord, "health-interval" => health_check_interval })
.concat(["--volume", "#{cord_host_directory}:#{cord_container_directory}"])
else
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
end
else
[]
end
end
def health_check_cmd
options = specializations["healthcheck"] || {}
options = config.healthcheck.merge(options) if running_traefik?
health_check_options["cmd"] || http_health_check(port: health_check_options["port"], path: health_check_options["path"])
end
options["cmd"] || http_health_check(port: options["port"], path: options["path"])
def health_check_cmd_with_cord
"(#{health_check_cmd}) && (stat #{cord_container_file} > /dev/null || exit 1)"
end
def health_check_interval
options = specializations["healthcheck"] || {}
options = config.healthcheck.merge(options) if running_traefik?
options["interval"] || "1s"
health_check_options["interval"] || "1s"
end
def uses_cord?
running_traefik? && cord_container_directory.present? && health_check_cmd.present?
end
def cord_host_directory
File.join config.run_directory_as_docker_volume, "cords", [full_name, config.run_id].join("-")
end
def cord_host_file
File.join cord_host_directory, CORD_FILE
end
def cord_container_directory
health_check_options.fetch("cord", nil)
end
def cord_container_file
File.join cord_container_directory, CORD_FILE
end
def cmd
specializations["cmd"]
end
@@ -85,6 +110,10 @@ class Kamal::Configuration::Role
name.web? || specializations["traefik"]
end
def full_name
[ config.service, name, config.destination ].compact.join("-")
end
private
attr_accessor :config
@@ -164,4 +193,12 @@ class Kamal::Configuration::Role
def http_health_check(port:, path:)
"curl -f #{URI.join("http://localhost:#{port}", path)} || exit 1" if path.present? || port.present?
end
def health_check_options
@health_check_options ||= begin
options = specializations["healthcheck"] || {}
options = config.healthcheck.merge(options) if running_traefik?
options
end
end
end

View File

@@ -1,5 +1,5 @@
class Kamal::Utils::HealthcheckPoller
TRAEFIK_HEALTHY_DELAY = 2
TRAEFIK_UPDATE_DELAY = 2
class HealthcheckError < StandardError; end
@@ -11,7 +11,7 @@ class Kamal::Utils::HealthcheckPoller
begin
case status = block.call
when "healthy"
sleep TRAEFIK_HEALTHY_DELAY if pause_after_ready
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
when "running" # No health check configured
sleep KAMAL.config.readiness_delay if pause_after_ready
else
@@ -31,6 +31,31 @@ class Kamal::Utils::HealthcheckPoller
info "Container is healthy!"
end
def wait_for_unhealthy(pause_after_ready: false, &block)
attempt = 1
max_attempts = KAMAL.config.healthcheck["max_attempts"]
begin
case status = block.call
when "unhealthy"
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
else
raise HealthcheckError, "container not unhealthy (#{status})"
end
rescue HealthcheckError => e
if attempt <= max_attempts
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt
attempt += 1
retry
else
raise
end
end
info "Container is unhealthy!"
end
private
def info(message)
SSHKit.config.output.info(message)