Zero downtime deployment with cord file
When replacing a container currently we:
1. Boot the new container
2. Wait for it to become healthy
3. Stop the old container
Traefik will send requests to the old container until it notices that it
is unhealthy. But it may have stopped serving requests before that point
which can result in errors.
To get round that the new boot process is:
1. Create a directory with a single file on the host
2. Boot the new container, mounting the cord file into /tmp and
including a check for the file in the docker healthcheck
3. Wait for it to become healthy
4. Delete the healthcheck file ("cut the cord") for the old container
5. Wait for it to become unhealthy and give Traefik a couple of seconds
to notice
6. Stop the old container
The extra steps ensure that Traefik stops sending requests before the
old container is shutdown.
This commit is contained in:
@@ -18,8 +18,9 @@ class Kamal::Cli::App < Kamal::Cli::Base
|
||||
roles.each do |role|
|
||||
app = KAMAL.app(role: role)
|
||||
auditor = KAMAL.auditor(role: role)
|
||||
role_config = KAMAL.config.role(role)
|
||||
|
||||
if capture_with_info(*app.container_id_for_version(version, only_running: true), raise_on_non_zero_exit: false).present?
|
||||
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
|
||||
tmp_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
|
||||
info "Renaming container #{version} to #{tmp_version} as already deployed on #{host}"
|
||||
execute *auditor.record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
|
||||
@@ -29,11 +30,25 @@ class Kamal::Cli::App < Kamal::Cli::Base
|
||||
execute *auditor.record("Booted app version #{version}"), verbosity: :debug
|
||||
|
||||
old_version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
|
||||
execute *app.start_or_run(hostname: "#{host}-#{SecureRandom.hex(6)}")
|
||||
|
||||
if role_config.uses_cord?
|
||||
execute *app.tie_cord(role_config.cord_host_file)
|
||||
end
|
||||
|
||||
execute *app.run(hostname: "#{host}-#{SecureRandom.hex(6)}")
|
||||
|
||||
Kamal::Utils::HealthcheckPoller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
|
||||
|
||||
execute *app.stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
|
||||
if old_version.present?
|
||||
if role_config.uses_cord?
|
||||
cord = capture_with_info(*app.cord(version: old_version), raise_on_non_zero_exit: false).strip
|
||||
if cord.present?
|
||||
execute *app.cut_cord(cord)
|
||||
Kamal::Utils::HealthcheckPoller.wait_for_unhealthy(pause_after_ready: true) { capture_with_info(*app.status(version: old_version)) }
|
||||
end
|
||||
end
|
||||
execute *app.stop(version: old_version), raise_on_non_zero_exit: false
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,34 +1,29 @@
|
||||
class Kamal::Commands::App < Kamal::Commands::Base
|
||||
ACTIVE_DOCKER_STATUSES = [ :running, :restarting ]
|
||||
|
||||
attr_reader :role
|
||||
attr_reader :role, :role_config
|
||||
|
||||
def initialize(config, role: nil)
|
||||
super(config)
|
||||
@role = role
|
||||
end
|
||||
|
||||
def start_or_run(hostname: nil)
|
||||
combine start, run(hostname: hostname), by: "||"
|
||||
@role_config = config.role(self.role)
|
||||
end
|
||||
|
||||
def run(hostname: nil)
|
||||
role = config.role(self.role)
|
||||
|
||||
docker :run,
|
||||
"--detach",
|
||||
"--restart unless-stopped",
|
||||
"--name", container_name,
|
||||
*(["--hostname", hostname] if hostname),
|
||||
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
|
||||
*role.env_args,
|
||||
*role.health_check_args,
|
||||
*role_config.env_args,
|
||||
*role_config.health_check_args,
|
||||
*config.logging_args,
|
||||
*config.volume_args,
|
||||
*role.label_args,
|
||||
*role.option_args,
|
||||
*role_config.label_args,
|
||||
*role_config.option_args,
|
||||
config.absolute_image,
|
||||
role.cmd
|
||||
role_config.cmd
|
||||
end
|
||||
|
||||
def start
|
||||
@@ -76,14 +71,12 @@ class Kamal::Commands::App < Kamal::Commands::Base
|
||||
end
|
||||
|
||||
def execute_in_new_container(*command, interactive: false)
|
||||
role = config.role(self.role)
|
||||
|
||||
docker :run,
|
||||
("-it" if interactive),
|
||||
"--rm",
|
||||
*role&.env_args,
|
||||
*role_config&.env_args,
|
||||
*config.volume_args,
|
||||
*role&.option_args,
|
||||
*role_config&.option_args,
|
||||
config.absolute_image,
|
||||
*command
|
||||
end
|
||||
@@ -112,7 +105,7 @@ class Kamal::Commands::App < Kamal::Commands::Base
|
||||
def list_versions(*docker_args, statuses: nil)
|
||||
pipe \
|
||||
docker(:ps, *filter_args(statuses: statuses), *docker_args, "--format", '"{{.Names}}"'),
|
||||
%(while read line; do echo ${line##{service_role_dest}-}; done) # Extract SHA from "service-role-dest-SHA"
|
||||
%(while read line; do echo ${line##{role_config.full_name}-}; done) # Extract SHA from "service-role-dest-SHA"
|
||||
end
|
||||
|
||||
def list_containers
|
||||
@@ -150,16 +143,30 @@ class Kamal::Commands::App < Kamal::Commands::Base
|
||||
end
|
||||
|
||||
def make_env_directory
|
||||
make_directory config.role(role).host_env_directory
|
||||
make_directory role_config.host_env_directory
|
||||
end
|
||||
|
||||
def remove_env_file
|
||||
[:rm, "-f", config.role(role).host_env_file_path]
|
||||
[:rm, "-f", role_config.host_env_file_path]
|
||||
end
|
||||
|
||||
def cord(version:)
|
||||
pipe \
|
||||
docker(:inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", container_name(version)),
|
||||
[:awk, "'$2 == \"#{role_config.cord_container_directory}\" {print $1}'"]
|
||||
end
|
||||
|
||||
def tie_cord(cord)
|
||||
create_empty_file(cord)
|
||||
end
|
||||
|
||||
def cut_cord(cord)
|
||||
remove_directory(cord)
|
||||
end
|
||||
|
||||
private
|
||||
def container_name(version = nil)
|
||||
[ config.service, role, config.destination, version || config.version ].compact.join("-")
|
||||
[ role_config.full_name, version || config.version ].compact.join("-")
|
||||
end
|
||||
|
||||
def filter_args(statuses: nil)
|
||||
|
||||
@@ -34,6 +34,10 @@ module Kamal::Commands
|
||||
[ :mkdir, "-p", path ]
|
||||
end
|
||||
|
||||
def remove_directory(path)
|
||||
[ :rm, "-r", path ]
|
||||
end
|
||||
|
||||
private
|
||||
def combine(*commands, by: "&&")
|
||||
commands
|
||||
@@ -69,5 +73,11 @@ module Kamal::Commands
|
||||
def tags(**details)
|
||||
Kamal::Tags.from_config(config, **details)
|
||||
end
|
||||
|
||||
def create_empty_file(file)
|
||||
chain \
|
||||
make_directory_for(file),
|
||||
[:touch, file]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -10,7 +10,7 @@ class Kamal::Commands::Healthcheck < Kamal::Commands::Base
|
||||
"--label", "service=#{container_name}",
|
||||
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
|
||||
*web.env_args,
|
||||
*web.health_check_args,
|
||||
*web.health_check_args(cord: false),
|
||||
*config.volume_args,
|
||||
*web.option_args,
|
||||
config.absolute_image,
|
||||
|
||||
@@ -61,6 +61,14 @@ class Kamal::Configuration
|
||||
raw_config.run_directory || ".kamal"
|
||||
end
|
||||
|
||||
def run_directory_as_docker_volume
|
||||
if Pathname.new(run_directory).absolute?
|
||||
run_directory
|
||||
else
|
||||
File.join "$(pwd)", run_directory
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
def roles
|
||||
@roles ||= role_names.collect { |role_name| Role.new(role_name, config: self) }
|
||||
@@ -141,7 +149,7 @@ class Kamal::Configuration
|
||||
|
||||
|
||||
def healthcheck
|
||||
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999 }.merge(raw_config.healthcheck || {})
|
||||
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord" }.merge(raw_config.healthcheck || {})
|
||||
end
|
||||
|
||||
def readiness_delay
|
||||
@@ -199,6 +207,10 @@ class Kamal::Configuration
|
||||
"#{run_directory}/env"
|
||||
end
|
||||
|
||||
def run_id
|
||||
@run_id ||= SecureRandom.hex(16)
|
||||
end
|
||||
|
||||
private
|
||||
# Will raise ArgumentError if any required config keys are missing
|
||||
def ensure_required_keys_present
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
class Kamal::Configuration::Role
|
||||
CORD_FILE = "cord"
|
||||
delegate :argumentize, :env_file_with_secrets, :optionize, to: Kamal::Utils
|
||||
|
||||
attr_accessor :name
|
||||
@@ -47,28 +48,52 @@ class Kamal::Configuration::Role
|
||||
argumentize "--env-file", host_env_file_path
|
||||
end
|
||||
|
||||
def health_check_args
|
||||
def health_check_args(cord: true)
|
||||
if health_check_cmd.present?
|
||||
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
|
||||
if cord && uses_cord?
|
||||
optionize({ "health-cmd" => health_check_cmd_with_cord, "health-interval" => health_check_interval })
|
||||
.concat(["--volume", "#{cord_host_directory}:#{cord_container_directory}"])
|
||||
else
|
||||
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
|
||||
end
|
||||
else
|
||||
[]
|
||||
end
|
||||
end
|
||||
|
||||
def health_check_cmd
|
||||
options = specializations["healthcheck"] || {}
|
||||
options = config.healthcheck.merge(options) if running_traefik?
|
||||
health_check_options["cmd"] || http_health_check(port: health_check_options["port"], path: health_check_options["path"])
|
||||
end
|
||||
|
||||
options["cmd"] || http_health_check(port: options["port"], path: options["path"])
|
||||
def health_check_cmd_with_cord
|
||||
"(#{health_check_cmd}) && (stat #{cord_container_file} > /dev/null || exit 1)"
|
||||
end
|
||||
|
||||
def health_check_interval
|
||||
options = specializations["healthcheck"] || {}
|
||||
options = config.healthcheck.merge(options) if running_traefik?
|
||||
|
||||
options["interval"] || "1s"
|
||||
health_check_options["interval"] || "1s"
|
||||
end
|
||||
|
||||
def uses_cord?
|
||||
running_traefik? && cord_container_directory.present? && health_check_cmd.present?
|
||||
end
|
||||
|
||||
def cord_host_directory
|
||||
File.join config.run_directory_as_docker_volume, "cords", [full_name, config.run_id].join("-")
|
||||
end
|
||||
|
||||
def cord_host_file
|
||||
File.join cord_host_directory, CORD_FILE
|
||||
end
|
||||
|
||||
def cord_container_directory
|
||||
health_check_options.fetch("cord", nil)
|
||||
end
|
||||
|
||||
def cord_container_file
|
||||
File.join cord_container_directory, CORD_FILE
|
||||
end
|
||||
|
||||
|
||||
def cmd
|
||||
specializations["cmd"]
|
||||
end
|
||||
@@ -85,6 +110,10 @@ class Kamal::Configuration::Role
|
||||
name.web? || specializations["traefik"]
|
||||
end
|
||||
|
||||
def full_name
|
||||
[ config.service, name, config.destination ].compact.join("-")
|
||||
end
|
||||
|
||||
private
|
||||
attr_accessor :config
|
||||
|
||||
@@ -164,4 +193,12 @@ class Kamal::Configuration::Role
|
||||
def http_health_check(port:, path:)
|
||||
"curl -f #{URI.join("http://localhost:#{port}", path)} || exit 1" if path.present? || port.present?
|
||||
end
|
||||
|
||||
def health_check_options
|
||||
@health_check_options ||= begin
|
||||
options = specializations["healthcheck"] || {}
|
||||
options = config.healthcheck.merge(options) if running_traefik?
|
||||
options
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
class Kamal::Utils::HealthcheckPoller
|
||||
TRAEFIK_HEALTHY_DELAY = 2
|
||||
TRAEFIK_UPDATE_DELAY = 2
|
||||
|
||||
class HealthcheckError < StandardError; end
|
||||
|
||||
@@ -11,7 +11,7 @@ class Kamal::Utils::HealthcheckPoller
|
||||
begin
|
||||
case status = block.call
|
||||
when "healthy"
|
||||
sleep TRAEFIK_HEALTHY_DELAY if pause_after_ready
|
||||
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
|
||||
when "running" # No health check configured
|
||||
sleep KAMAL.config.readiness_delay if pause_after_ready
|
||||
else
|
||||
@@ -31,6 +31,31 @@ class Kamal::Utils::HealthcheckPoller
|
||||
info "Container is healthy!"
|
||||
end
|
||||
|
||||
def wait_for_unhealthy(pause_after_ready: false, &block)
|
||||
attempt = 1
|
||||
max_attempts = KAMAL.config.healthcheck["max_attempts"]
|
||||
|
||||
begin
|
||||
case status = block.call
|
||||
when "unhealthy"
|
||||
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
|
||||
else
|
||||
raise HealthcheckError, "container not unhealthy (#{status})"
|
||||
end
|
||||
rescue HealthcheckError => e
|
||||
if attempt <= max_attempts
|
||||
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
|
||||
sleep attempt
|
||||
attempt += 1
|
||||
retry
|
||||
else
|
||||
raise
|
||||
end
|
||||
end
|
||||
|
||||
info "Container is unhealthy!"
|
||||
end
|
||||
|
||||
private
|
||||
def info(message)
|
||||
SSHKit.config.output.info(message)
|
||||
|
||||
Reference in New Issue
Block a user