Replace Traefik with kamal-proxy

[kamal-proxy](https://github.com/basecamp/kamal-proxy) is a custom
minimal proxy designed specifically for Kamal.

It has some advantages over Traefik:
1. Imperative deployments - we tell it to switch from container A to
   container B, and it waits for container B to start then switches. No
   need to poll for health checks ourselves or mess around with forcing
   health checks to fail.
2. Support for multiple apps - as much as possible, configuration is
   supplied at runtime by the deploy command, allowing us to have
   multiple apps share a proxy without conflicting config.
3. First class support for Kamal operations - rather than trying to
   work out how to make Traefik do what we want, we can build features
   directly into the proxy, making configuration simpler and avoiding
   obscure errors
This commit is contained in:
Donal McBreen
2024-03-08 08:19:48 +00:00
parent 90ecb6a12a
commit 6568cef868
66 changed files with 746 additions and 1161 deletions

View File

@@ -9,16 +9,16 @@ class Kamal::Cli::App < Kamal::Cli::Base
# Assets are prepared in a separate step to ensure they are on all hosts before booting
on(KAMAL.hosts) do
KAMAL.roles_on(host).each do |role|
Kamal::Cli::App::PrepareAssets.new(host, role, self).run
PrepareAssets.new(host, role, self).run
end
end
# Primary hosts and roles are returned first, so they can open the barrier
barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many?
barrier = Barrier.new if KAMAL.roles.many?
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
KAMAL.roles_on(host).each do |role|
Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run
Boot.new(host, role, self, version, barrier).run
end
end
@@ -38,8 +38,17 @@ class Kamal::Cli::App < Kamal::Cli::Base
roles = KAMAL.roles_on(host)
roles.each do |role|
app = KAMAL.app(role: role, host: host)
execute *KAMAL.auditor.record("Started app version #{KAMAL.config.version}"), verbosity: :debug
execute *KAMAL.app(role: role, host: host).start, raise_on_non_zero_exit: false
execute *app.start, raise_on_non_zero_exit: false
if role.running_proxy?
version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
endpoint = capture_with_info(*app.container_endpoint(version: version)).strip
raise Kamal::Cli::BootError, "Failed to get endpoint for #{role} on #{host}, did the container boot?" if endpoint.empty?
execute *KAMAL.proxy.deploy(role.container_prefix, target: endpoint)
end
end
end
end
@@ -52,8 +61,19 @@ class Kamal::Cli::App < Kamal::Cli::Base
roles = KAMAL.roles_on(host)
roles.each do |role|
app = KAMAL.app(role: role, host: host)
version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
execute *KAMAL.auditor.record("Stopped app", role: role), verbosity: :debug
execute *KAMAL.app(role: role, host: host).stop, raise_on_non_zero_exit: false
if role.running_proxy?
endpoint = capture_with_info(*app.container_endpoint(version: version)).strip
if endpoint.present?
execute *KAMAL.proxy.remove(role.container_prefix, target: endpoint), raise_on_non_zero_exit: false
end
end
execute *app.stop, raise_on_non_zero_exit: false
end
end
end

View File

@@ -1,4 +1,4 @@
class Kamal::Cli::Healthcheck::Barrier
class Kamal::Cli::App::Barrier
def initialize
@ivar = Concurrent::IVar.new
end
@@ -13,7 +13,7 @@ class Kamal::Cli::Healthcheck::Barrier
def wait
unless opened?
raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier")
raise Kamal::Cli::BootError.new("Halted at barrier")
end
end

View File

@@ -1,7 +1,7 @@
class Kamal::Cli::App::Boot
attr_reader :host, :role, :version, :barrier, :sshkit
delegate :execute, :capture_with_info, :capture_with_pretty_json, :info, :error, to: :sshkit
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
delegate :assets?, :running_proxy?, to: :role
def initialize(host, role, sshkit, version, barrier)
@host = host
@@ -45,11 +45,13 @@ class Kamal::Cli::App::Boot
def start_new_version
audit "Booted app version #{version}"
execute *app.tie_cord(role.cord_host_file) if uses_cord?
hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}"
execute *app.run(hostname: hostname)
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
if running_proxy?
endpoint = capture_with_info(*app.container_endpoint(version: version)).strip
raise Kamal::Cli::BootError, "Failed to get endpoint for #{role} on #{host}, did the container boot?" if endpoint.empty?
execute *KAMAL.proxy.deploy(role.container_prefix, target: endpoint)
end
end
def stop_new_version
@@ -57,16 +59,7 @@ class Kamal::Cli::App::Boot
end
def stop_old_version(version)
if uses_cord?
cord = capture_with_info(*app.cord(version: version), raise_on_non_zero_exit: false).strip
if cord.present?
execute *app.cut_cord(cord)
Kamal::Cli::Healthcheck::Poller.wait_for_unhealthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
end
end
execute *app.stop(version: version), raise_on_non_zero_exit: false
execute *app.clean_up_assets if assets?
end
@@ -80,7 +73,7 @@ class Kamal::Cli::App::Boot
info "Waiting for the first healthy #{KAMAL.primary_role} container before booting #{role} on #{host}..."
barrier.wait
info "First #{KAMAL.primary_role} container is healthy, booting #{role} on #{host}..."
rescue Kamal::Cli::Healthcheck::Error
rescue Kamal::Cli::BootError
info "First #{KAMAL.primary_role} container is unhealthy, not booting #{role} on #{host}"
raise
end
@@ -89,7 +82,6 @@ class Kamal::Cli::App::Boot
if barrier.close
info "First #{KAMAL.primary_role} container is unhealthy on #{host}, not booting other roles"
error capture_with_info(*app.logs(version: version))
error capture_with_info(*app.container_health_log(version: version))
end
end

View File

@@ -13,11 +13,6 @@ class Kamal::Cli::Env < Kamal::Cli::Base
end
end
on(KAMAL.traefik_hosts) do
execute *KAMAL.traefik.make_env_directory
upload! KAMAL.traefik.env.secrets_io, KAMAL.traefik.env.secrets_file, mode: 400
end
on(KAMAL.accessory_hosts) do
KAMAL.accessories_on(host).each do |accessory|
accessory_config = KAMAL.config.accessory(accessory)
@@ -39,10 +34,6 @@ class Kamal::Cli::Env < Kamal::Cli::Base
end
end
on(KAMAL.traefik_hosts) do
execute *KAMAL.traefik.remove_env_file
end
on(KAMAL.accessory_hosts) do
KAMAL.accessories_on(host).each do |accessory|
accessory_config = KAMAL.config.accessory(accessory)

View File

@@ -1,2 +0,0 @@
class Kamal::Cli::Healthcheck::Error < StandardError
end

View File

@@ -1,63 +0,0 @@
module Kamal::Cli::Healthcheck::Poller
extend self
TRAEFIK_UPDATE_DELAY = 5
def wait_for_healthy(pause_after_ready: false, &block)
attempt = 1
max_attempts = KAMAL.config.healthcheck["max_attempts"]
begin
case status = block.call
when "healthy"
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
when "running" # No health check configured
sleep KAMAL.config.readiness_delay if pause_after_ready
else
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
end
rescue Kamal::Cli::Healthcheck::Error => e
if attempt <= max_attempts
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt
attempt += 1
retry
else
raise
end
end
info "Container is healthy!"
end
def wait_for_unhealthy(pause_after_ready: false, &block)
attempt = 1
max_attempts = KAMAL.config.healthcheck["max_attempts"]
begin
case status = block.call
when "unhealthy"
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
else
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
end
rescue Kamal::Cli::Healthcheck::Error => e
if attempt <= max_attempts
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt
attempt += 1
retry
else
raise
end
end
info "Container is unhealthy!"
end
private
def info(message)
SSHKit.config.output.info(message)
end
end

View File

@@ -38,8 +38,8 @@ class Kamal::Cli::Main < Kamal::Cli::Base
with_lock do
run_hook "pre-deploy"
say "Ensure Traefik is running...", :magenta
invoke "kamal:cli:traefik:boot", [], invoke_options
say "Ensure proxy is running...", :magenta
invoke "kamal:cli:proxy:boot", [], invoke_options
say "Detect stale containers...", :magenta
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
@@ -54,7 +54,7 @@ class Kamal::Cli::Main < Kamal::Cli::Base
run_hook "post-deploy", runtime: runtime.round
end
desc "redeploy", "Deploy app to servers without bootstrapping servers, starting Traefik, pruning, and registry login"
desc "redeploy", "Deploy app to servers without bootstrapping servers, starting proxy, pruning, and registry login"
option :skip_push, aliases: "-P", type: :boolean, default: false, desc: "Skip image build and push"
def redeploy
runtime = print_runtime do
@@ -107,7 +107,7 @@ class Kamal::Cli::Main < Kamal::Cli::Base
desc "details", "Show details about all containers"
def details
invoke "kamal:cli:traefik:details"
invoke "kamal:cli:proxy:details"
invoke "kamal:cli:app:details"
invoke "kamal:cli:accessory:details", [ "all" ]
end
@@ -189,12 +189,12 @@ class Kamal::Cli::Main < Kamal::Cli::Base
end
end
desc "remove", "Remove Traefik, app, accessories, and registry session from servers"
desc "remove", "Remove proxy, app, accessories, and registry session from servers"
option :confirmed, aliases: "-y", type: :boolean, default: false, desc: "Proceed without confirmation question"
def remove
confirming "This will remove all containers and images. Are you sure?" do
with_lock do
invoke "kamal:cli:traefik:remove", [], options.without(:confirmed)
invoke "kamal:cli:proxy:remove", [], options.without(:confirmed)
invoke "kamal:cli:app:remove", [], options.without(:confirmed)
invoke "kamal:cli:accessory:remove", [ "all" ], options
invoke "kamal:cli:registry:logout", [], options.without(:confirmed)
@@ -231,8 +231,8 @@ class Kamal::Cli::Main < Kamal::Cli::Base
desc "server", "Bootstrap servers with curl and Docker"
subcommand "server", Kamal::Cli::Server
desc "traefik", "Manage Traefik load balancer"
subcommand "traefik", Kamal::Cli::Traefik
desc "proxy", "Manage load balancer proxy"
subcommand "proxy", Kamal::Cli::Proxy
private
def container_available?(version)

120
lib/kamal/cli/proxy.rb Normal file
View File

@@ -0,0 +1,120 @@
class Kamal::Cli::Proxy < Kamal::Cli::Base
desc "boot", "Boot proxy on servers"
def boot
with_lock do
on(KAMAL.proxy_hosts) do
execute *KAMAL.registry.login
execute *KAMAL.proxy.start_or_run
end
end
end
desc "reboot", "Reboot proxy on servers (stop container, remove container, start new container)"
option :rolling, type: :boolean, default: false, desc: "Reboot proxy on hosts in sequence, rather than in parallel"
option :confirmed, aliases: "-y", type: :boolean, default: false, desc: "Proceed without confirmation question"
def reboot
confirming "This will cause a brief outage on each host. Are you sure?" do
with_lock do
host_groups = options[:rolling] ? KAMAL.proxy_hosts : [ KAMAL.proxy_hosts ]
host_groups.each do |hosts|
host_list = Array(hosts).join(",")
run_hook "pre-proxy-reboot", hosts: host_list
on(hosts) do
execute *KAMAL.auditor.record("Rebooted proxy"), verbosity: :debug
execute *KAMAL.registry.login
execute *KAMAL.proxy.stop, raise_on_non_zero_exit: false
execute *KAMAL.proxy.remove_container
execute *KAMAL.proxy.run
end
run_hook "post-proxy-reboot", hosts: host_list
end
end
end
end
desc "start", "Start existing proxy container on servers"
def start
with_lock do
on(KAMAL.proxy_hosts) do
execute *KAMAL.auditor.record("Started proxy"), verbosity: :debug
execute *KAMAL.proxy.start
end
end
end
desc "stop", "Stop existing proxy container on servers"
def stop
with_lock do
on(KAMAL.proxy_hosts) do
execute *KAMAL.auditor.record("Stopped proxy"), verbosity: :debug
execute *KAMAL.proxy.stop, raise_on_non_zero_exit: false
end
end
end
desc "restart", "Restart existing proxy container on servers"
def restart
with_lock do
stop
start
end
end
desc "details", "Show details about proxy container from servers"
def details
on(KAMAL.proxy_hosts) { |host| puts_by_host host, capture_with_info(*KAMAL.proxy.info), type: "Proxy" }
end
desc "logs", "Show log lines from proxy on servers"
option :since, aliases: "-s", desc: "Show logs since timestamp (e.g. 2013-01-02T13:23:37Z) or relative (e.g. 42m for 42 minutes)"
option :lines, type: :numeric, aliases: "-n", desc: "Number of log lines to pull from each server"
option :grep, aliases: "-g", desc: "Show lines with grep match only (use this to fetch specific requests by id)"
option :follow, aliases: "-f", desc: "Follow logs on primary server (or specific host set by --hosts)"
def logs
grep = options[:grep]
if options[:follow]
run_locally do
info "Following logs on #{KAMAL.primary_host}..."
info KAMAL.proxy.follow_logs(host: KAMAL.primary_host, grep: grep)
exec KAMAL.proxy.follow_logs(host: KAMAL.primary_host, grep: grep)
end
else
since = options[:since]
lines = options[:lines].presence || ((since || grep) ? nil : 100) # Default to 100 lines if since or grep isn't set
on(KAMAL.proxy_hosts) do |host|
puts_by_host host, capture(*KAMAL.proxy.logs(since: since, lines: lines, grep: grep)), type: "Proxy"
end
end
end
desc "remove", "Remove proxy container and image from servers"
def remove
with_lock do
stop
remove_container
remove_image
end
end
desc "remove_container", "Remove proxy container from servers", hide: true
def remove_container
with_lock do
on(KAMAL.proxy_hosts) do
execute *KAMAL.auditor.record("Removed proxy container"), verbosity: :debug
execute *KAMAL.proxy.remove_container
end
end
end
desc "remove_image", "Remove proxy image from servers", hide: true
def remove_image
with_lock do
on(KAMAL.proxy_hosts) do
execute *KAMAL.auditor.record("Removed proxy image"), verbosity: :debug
execute *KAMAL.proxy.remove_image
end
end
end
end

View File

@@ -28,7 +28,6 @@ class Kamal::Cli::Prune < Kamal::Cli::Base
on(KAMAL.hosts) do
execute *KAMAL.auditor.record("Pruned containers"), verbosity: :debug
execute *KAMAL.prune.app_containers(retain: retain)
execute *KAMAL.prune.healthcheck_containers
end
end
end

View File

@@ -63,17 +63,6 @@ registry:
# directories:
# - data:/data
# Configure custom arguments for Traefik. Be sure to reboot traefik when you modify it.
# traefik:
# args:
# accesslog: true
# accesslog.format: json
# Configure a custom healthcheck (default is /up on port 3000)
# healthcheck:
# path: /healthz
# port: 4000
# Bridge fingerprinted assets, like JS and CSS, between versions to avoid
# hitting 404 on in-flight requests. Combines all files from new and old
# version inside the asset_path.

View File

@@ -0,0 +1,3 @@
#!/bin/sh
echo "Rebooted proxy on $KAMAL_HOSTS"

View File

@@ -1,3 +0,0 @@
#!/bin/sh
echo "Rebooted Traefik on $KAMAL_HOSTS"

View File

@@ -0,0 +1,3 @@
#!/bin/sh
echo "Rebooting proxy on $KAMAL_HOSTS..."

View File

@@ -1,3 +0,0 @@
#!/bin/sh
echo "Rebooting Traefik on $KAMAL_HOSTS..."

View File

@@ -1,120 +0,0 @@
class Kamal::Cli::Traefik < Kamal::Cli::Base
desc "boot", "Boot Traefik on servers"
def boot
with_lock do
on(KAMAL.traefik_hosts) do
execute *KAMAL.registry.login
execute *KAMAL.traefik.start_or_run
end
end
end
desc "reboot", "Reboot Traefik on servers (stop container, remove container, start new container)"
option :rolling, type: :boolean, default: false, desc: "Reboot traefik on hosts in sequence, rather than in parallel"
option :confirmed, aliases: "-y", type: :boolean, default: false, desc: "Proceed without confirmation question"
def reboot
confirming "This will cause a brief outage on each host. Are you sure?" do
with_lock do
host_groups = options[:rolling] ? KAMAL.traefik_hosts : [ KAMAL.traefik_hosts ]
host_groups.each do |hosts|
host_list = Array(hosts).join(",")
run_hook "pre-traefik-reboot", hosts: host_list
on(hosts) do
execute *KAMAL.auditor.record("Rebooted traefik"), verbosity: :debug
execute *KAMAL.registry.login
execute *KAMAL.traefik.stop, raise_on_non_zero_exit: false
execute *KAMAL.traefik.remove_container
execute *KAMAL.traefik.run
end
run_hook "post-traefik-reboot", hosts: host_list
end
end
end
end
desc "start", "Start existing Traefik container on servers"
def start
with_lock do
on(KAMAL.traefik_hosts) do
execute *KAMAL.auditor.record("Started traefik"), verbosity: :debug
execute *KAMAL.traefik.start
end
end
end
desc "stop", "Stop existing Traefik container on servers"
def stop
with_lock do
on(KAMAL.traefik_hosts) do
execute *KAMAL.auditor.record("Stopped traefik"), verbosity: :debug
execute *KAMAL.traefik.stop, raise_on_non_zero_exit: false
end
end
end
desc "restart", "Restart existing Traefik container on servers"
def restart
with_lock do
stop
start
end
end
desc "details", "Show details about Traefik container from servers"
def details
on(KAMAL.traefik_hosts) { |host| puts_by_host host, capture_with_info(*KAMAL.traefik.info), type: "Traefik" }
end
desc "logs", "Show log lines from Traefik on servers"
option :since, aliases: "-s", desc: "Show logs since timestamp (e.g. 2013-01-02T13:23:37Z) or relative (e.g. 42m for 42 minutes)"
option :lines, type: :numeric, aliases: "-n", desc: "Number of log lines to pull from each server"
option :grep, aliases: "-g", desc: "Show lines with grep match only (use this to fetch specific requests by id)"
option :follow, aliases: "-f", desc: "Follow logs on primary server (or specific host set by --hosts)"
def logs
grep = options[:grep]
if options[:follow]
run_locally do
info "Following logs on #{KAMAL.primary_host}..."
info KAMAL.traefik.follow_logs(host: KAMAL.primary_host, grep: grep)
exec KAMAL.traefik.follow_logs(host: KAMAL.primary_host, grep: grep)
end
else
since = options[:since]
lines = options[:lines].presence || ((since || grep) ? nil : 100) # Default to 100 lines if since or grep isn't set
on(KAMAL.traefik_hosts) do |host|
puts_by_host host, capture(*KAMAL.traefik.logs(since: since, lines: lines, grep: grep)), type: "Traefik"
end
end
end
desc "remove", "Remove Traefik container and image from servers"
def remove
with_lock do
stop
remove_container
remove_image
end
end
desc "remove_container", "Remove Traefik container from servers", hide: true
def remove_container
with_lock do
on(KAMAL.traefik_hosts) do
execute *KAMAL.auditor.record("Removed traefik container"), verbosity: :debug
execute *KAMAL.traefik.remove_container
end
end
end
desc "remove_image", "Remove Traefik image from servers", hide: true
def remove_image
with_lock do
on(KAMAL.traefik_hosts) do
execute *KAMAL.auditor.record("Removed traefik image"), verbosity: :debug
execute *KAMAL.traefik.remove_image
end
end
end
end