From 0efb5ccfffe4faf5553782768d328a89abb88706 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Thu, 21 Mar 2024 11:36:21 +0000 Subject: [PATCH 1/9] Remove the healthcheck step To speed up deployments, we'll remove the healthcheck step. This adds some risk to deployments for non-web roles - if they don't have a Docker healthcheck configured then the only check we do is if the container is running. If there is a bad image we might see the container running before it exits and deploy it. Previously the healthcheck step would have avoided this by ensuring a web container could boot and serve traffic first. To mitigate this, we'll add a deployment barrier. Until one of the primary role containers passes its healthcheck, we'll keep the barrier up and avoid stopping the containers on the non-primary roles. It the primary role container fails its healthcheck, we'll close the barrier and shut down the new containers on the waiting roles. We also have a new integration test to check we correctly handle a a broken image. This highlighted that SSHKit's default runner will stop at the first error it encounters. We'll now have a custom runner that waits for all threads to finish allowing them to clean up. --- lib/kamal/cli/app.rb | 9 +- lib/kamal/cli/app/boot.rb | 56 +++++++- lib/kamal/cli/healthcheck.rb | 21 --- lib/kamal/cli/healthcheck/barrier.rb | 31 +++++ lib/kamal/cli/healthcheck/error.rb | 2 + lib/kamal/cli/healthcheck/poller.rb | 9 +- lib/kamal/cli/main.rb | 11 -- lib/kamal/commander.rb | 1 + lib/kamal/commands/healthcheck.rb | 59 --------- lib/kamal/configuration.rb | 2 +- lib/kamal/sshkit_with_ext.rb | 31 +++++ test/cli/app_test.rb | 64 ++++++++- test/cli/healthcheck_test.rb | 82 ------------ test/cli/main_test.rb | 12 -- test/commander_test.rb | 5 + test/commands/healthcheck_test.rb | 124 ------------------ test/configuration_test.rb | 2 +- .../deploy_with_two_roles_one_host.yml | 15 +++ test/integration/broken_deploy_test.rb | 24 ++++ .../docker/deployer/app/config/deploy.yml | 2 + .../deployer/app_with_roles/config/deploy.yml | 2 + test/integration/docker/deployer/break_app.sh | 3 + test/integration/integration_test.rb | 17 +++ test/integration/main_test.rb | 12 +- 24 files changed, 269 insertions(+), 327 deletions(-) delete mode 100644 lib/kamal/cli/healthcheck.rb create mode 100644 lib/kamal/cli/healthcheck/barrier.rb create mode 100644 lib/kamal/cli/healthcheck/error.rb delete mode 100644 lib/kamal/commands/healthcheck.rb delete mode 100644 test/cli/healthcheck_test.rb delete mode 100644 test/commands/healthcheck_test.rb create mode 100644 test/fixtures/deploy_with_two_roles_one_host.yml create mode 100644 test/integration/broken_deploy_test.rb create mode 100755 test/integration/docker/deployer/break_app.sh diff --git a/lib/kamal/cli/app.rb b/lib/kamal/cli/app.rb index 188d5e9b..ae84e368 100644 --- a/lib/kamal/cli/app.rb +++ b/lib/kamal/cli/app.rb @@ -14,9 +14,12 @@ class Kamal::Cli::App < Kamal::Cli::Base end end + barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many? + on(KAMAL.hosts, **KAMAL.boot_strategy) do |host| + # Ensure primary role is booted first to allow the web barrier to be opened KAMAL.roles_on(host).each do |role| - Kamal::Cli::App::Boot.new(host, role, version, self).run + Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run end end @@ -284,4 +287,8 @@ class Kamal::Cli::App < Kamal::Cli::Base def version_or_latest options[:version] || KAMAL.config.latest_tag end + + def web_and_non_web_roles? + KAMAL.roles.any?(&:running_traefik?) && !KAMAL.roles.all?(&:running_traefik?) + end end diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index 78670104..de219a23 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -1,12 +1,13 @@ class Kamal::Cli::App::Boot - attr_reader :host, :role, :version, :sshkit + attr_reader :host, :role, :version, :barrier, :sshkit delegate :execute, :capture_with_info, :info, to: :sshkit - delegate :uses_cord?, :assets?, to: :role + delegate :uses_cord?, :assets?, :running_traefik?, to: :role - def initialize(host, role, version, sshkit) + def initialize(host, role, sshkit, version, barrier) @host = host @role = role @version = version + @barrier = barrier @sshkit = sshkit end @@ -46,10 +47,18 @@ class Kamal::Cli::App::Boot def start_new_version audit "Booted app version #{version}" + execute *app.tie_cord(role.cord_host_file) if uses_cord? hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}" execute *app.run(hostname: hostname) Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } + + reach_barrier + rescue => e + close_barrier if barrier_role? + execute *app.stop(version: version), raise_on_non_zero_exit: false + + raise end def stop_old_version(version) @@ -65,4 +74,45 @@ class Kamal::Cli::App::Boot execute *app.clean_up_assets if assets? end + + def reach_barrier + if barrier + if barrier_role? + if barrier.open + info "Opened barrier (#{host})" + end + else + wait_for_barrier + end + end + end + + def wait_for_barrier + info "Waiting at web barrier (#{host})..." + barrier.wait + info "Barrier opened (#{host})" + rescue Kamal::Cli::Healthcheck::Error + info "Barrier closed, shutting down new container... (#{host})" + raise + end + + def close_barrier + barrier&.close + end + + def barrier_role? + role == KAMAL.primary_role + end + + def app + @app ||= KAMAL.app(role: role) + end + + def auditor + @auditor = KAMAL.auditor(role: role) + end + + def audit(message) + execute *auditor.record(message), verbosity: :debug + end end diff --git a/lib/kamal/cli/healthcheck.rb b/lib/kamal/cli/healthcheck.rb deleted file mode 100644 index b3dbf6bd..00000000 --- a/lib/kamal/cli/healthcheck.rb +++ /dev/null @@ -1,21 +0,0 @@ -class Kamal::Cli::Healthcheck < Kamal::Cli::Base - default_command :perform - - desc "perform", "Health check current app version" - def perform - raise "The primary host is not configured to run Traefik" unless KAMAL.config.role(KAMAL.config.primary_role).running_traefik? - on(KAMAL.primary_host) do - begin - execute *KAMAL.healthcheck.run - Poller.wait_for_healthy { capture_with_info(*KAMAL.healthcheck.status) } - rescue Poller::HealthcheckError => e - error capture_with_info(*KAMAL.healthcheck.logs) - error capture_with_pretty_json(*KAMAL.healthcheck.container_health_log) - raise - ensure - execute *KAMAL.healthcheck.stop, raise_on_non_zero_exit: false - execute *KAMAL.healthcheck.remove, raise_on_non_zero_exit: false - end - end - end -end diff --git a/lib/kamal/cli/healthcheck/barrier.rb b/lib/kamal/cli/healthcheck/barrier.rb new file mode 100644 index 00000000..0fbfb511 --- /dev/null +++ b/lib/kamal/cli/healthcheck/barrier.rb @@ -0,0 +1,31 @@ +class Kamal::Cli::Healthcheck::Barrier + def initialize + @ivar = Concurrent::IVar.new + end + + def close + set(false) + end + + def open + set(true) + end + + def wait + unless opened? + raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier") + end + end + + private + def opened? + @ivar.value + end + + def set(value) + @ivar.set(value) + true + rescue Concurrent::MultipleAssignmentError + false + end +end diff --git a/lib/kamal/cli/healthcheck/error.rb b/lib/kamal/cli/healthcheck/error.rb new file mode 100644 index 00000000..8824a72e --- /dev/null +++ b/lib/kamal/cli/healthcheck/error.rb @@ -0,0 +1,2 @@ +class Kamal::Cli::Healthcheck::Error < StandardError +end diff --git a/lib/kamal/cli/healthcheck/poller.rb b/lib/kamal/cli/healthcheck/poller.rb index 9d91adfc..06898b1c 100644 --- a/lib/kamal/cli/healthcheck/poller.rb +++ b/lib/kamal/cli/healthcheck/poller.rb @@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller TRAEFIK_UPDATE_DELAY = 5 - class HealthcheckError < StandardError; end def wait_for_healthy(pause_after_ready: false, &block) attempt = 1 @@ -16,9 +15,9 @@ module Kamal::Cli::Healthcheck::Poller when "running" # No health check configured sleep KAMAL.config.readiness_delay if pause_after_ready else - raise HealthcheckError, "container not ready (#{status})" + raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})" end - rescue HealthcheckError => e + rescue Kamal::Cli::Healthcheck::Error => e if attempt <= max_attempts info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..." sleep attempt @@ -41,9 +40,9 @@ module Kamal::Cli::Healthcheck::Poller when "unhealthy" sleep TRAEFIK_UPDATE_DELAY if pause_after_ready else - raise HealthcheckError, "container not unhealthy (#{status})" + raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})" end - rescue HealthcheckError => e + rescue Kamal::Cli::Healthcheck::Error => e if attempt <= max_attempts info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..." sleep attempt diff --git a/lib/kamal/cli/main.rb b/lib/kamal/cli/main.rb index 1cc9d98e..594fb3fd 100644 --- a/lib/kamal/cli/main.rb +++ b/lib/kamal/cli/main.rb @@ -42,11 +42,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base say "Ensure Traefik is running...", :magenta invoke "kamal:cli:traefik:boot", [], invoke_options - if KAMAL.config.role(KAMAL.config.primary_role).running_traefik? - say "Ensure app can pass healthcheck...", :magenta - invoke "kamal:cli:healthcheck:perform", [], invoke_options - end - say "Detect stale containers...", :magenta invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true) @@ -77,9 +72,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base run_hook "pre-deploy" - say "Ensure app can pass healthcheck...", :magenta - invoke "kamal:cli:healthcheck:perform", [], invoke_options - say "Detect stale containers...", :magenta invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true) @@ -228,9 +220,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base desc "env", "Manage environment files" subcommand "env", Kamal::Cli::Env - desc "healthcheck", "Healthcheck application" - subcommand "healthcheck", Kamal::Cli::Healthcheck - desc "lock", "Manage the deploy lock" subcommand "lock", Kamal::Cli::Lock diff --git a/lib/kamal/commander.rb b/lib/kamal/commander.rb index e7c5d21f..f4c54215 100644 --- a/lib/kamal/commander.rb +++ b/lib/kamal/commander.rb @@ -150,6 +150,7 @@ class Kamal::Commander sshkit.max_concurrent_starts = config.sshkit.max_concurrent_starts sshkit.ssh_options = config.ssh.options end + SSHKit.config.default_runner = SSHKit::Runner::ParallelCompleteAll SSHKit.config.command_map[:docker] = "docker" # No need to use /usr/bin/env, just clogs up the logs SSHKit.config.output_verbosity = verbosity end diff --git a/lib/kamal/commands/healthcheck.rb b/lib/kamal/commands/healthcheck.rb deleted file mode 100644 index 2517a5e2..00000000 --- a/lib/kamal/commands/healthcheck.rb +++ /dev/null @@ -1,59 +0,0 @@ -class Kamal::Commands::Healthcheck < Kamal::Commands::Base - def run - primary = config.role(config.primary_role) - - docker :run, - "--detach", - "--name", container_name_with_version, - "--publish", "#{exposed_port}:#{config.healthcheck["port"]}", - "--label", "service=#{config.healthcheck_service}", - "-e", "KAMAL_CONTAINER_NAME=\"#{config.healthcheck_service}\"", - *primary.env_args(config.primary_host), - *primary.health_check_args(cord: false), - *config.volume_args, - *primary.option_args, - config.absolute_image, - primary.cmd - end - - def status - pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT)) - end - - def container_health_log - pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_LOG_FORMAT)) - end - - def logs - pipe container_id, xargs(docker(:logs, "--tail", log_lines, "2>&1")) - end - - def stop - pipe container_id, xargs(docker(:stop)) - end - - def remove - pipe container_id, xargs(docker(:container, :rm)) - end - - private - def container_name_with_version - "#{config.healthcheck_service}-#{config.version}" - end - - def container_id - container_id_for(container_name: container_name_with_version) - end - - def health_url - "http://localhost:#{exposed_port}#{config.healthcheck["path"]}" - end - - def exposed_port - config.healthcheck["exposed_port"] - end - - def log_lines - config.healthcheck["log_lines"] - end -end diff --git a/lib/kamal/configuration.rb b/lib/kamal/configuration.rb index 4957be17..bed91cf6 100644 --- a/lib/kamal/configuration.rb +++ b/lib/kamal/configuration.rb @@ -188,7 +188,7 @@ class Kamal::Configuration def healthcheck - { "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {}) + { "path" => "/up", "port" => 3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {}) end def healthcheck_service diff --git a/lib/kamal/sshkit_with_ext.rb b/lib/kamal/sshkit_with_ext.rb index e0c62c3a..c556774b 100644 --- a/lib/kamal/sshkit_with_ext.rb +++ b/lib/kamal/sshkit_with_ext.rb @@ -103,3 +103,34 @@ class SSHKit::Backend::Netssh prepend LimitConcurrentStartsInstance end + +require "thread" + +module SSHKit + module Runner + class ParallelCompleteAll < Abstract + def execute + threads = hosts.map do |host| + Thread.new(host) do |h| + begin + backend(h, &block).run + rescue ::StandardError => e + e2 = SSHKit::Runner::ExecuteError.new e + raise e2, "Exception while executing #{host.user ? "as #{host.user}@" : "on host "}#{host}: #{e.message}" + end + end + end + + exception = nil + threads.each do |t| + begin + t.join + rescue SSHKit::Runner::ExecuteError => e + exception ||= e + end + end + raise exception if exception + end + end + end +end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index c75a86cc..ced41317 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -118,6 +118,62 @@ class CliAppTest < CliTestCase end end + test "boot with web barrier opened" do + Object.any_instance.stubs(:sleep) + + SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("running").at_least_once # web health check passing + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("unhealthy").at_least_once # web health check failing + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("running").at_least_once # workers health check + + run_command("boot", config: :with_roles, host: nil).tap do |output| + assert_match "Waiting at web barrier (1.1.1.3)...", output + assert_match "Waiting at web barrier (1.1.1.4)...", output + assert_match "Barrier opened (1.1.1.3)", output + assert_match "Barrier opened (1.1.1.4)", output + end + end + + test "boot with web barrier closed" do + Thread.report_on_exception = false + + Object.any_instance.stubs(:sleep) + + SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("unhealthy").at_least_once # web health check failing + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("running").at_least_once # workers health check passing + + stderred do + run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| + assert_match "Waiting at web barrier (1.1.1.3)...", output + assert_match "Waiting at web barrier (1.1.1.4)...", output + assert_match "Barrier closed, shutting down new container... (1.1.1.3)", output + assert_match "Barrier closed, shutting down new container... (1.1.1.4)", output + assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.1", output + assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.2", output + assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.3", output + assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.4", output + end + end + ensure + Thread.report_on_exception = true + end + test "start" do run_command("start").tap do |output| assert_match "docker start app-web-999", output @@ -283,8 +339,12 @@ class CliAppTest < CliTestCase end private - def run_command(*command, config: :with_accessories) - stdouted { Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", "--hosts", "1.1.1.1" ]) } + def run_command(*command, config: :with_accessories, host: "1.1.1.1", allow_execute_error: false) + stdouted do + Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", *([ "--hosts", host ] if host) ]) + rescue SSHKit::Runner::ExecuteError => e + raise e unless allow_execute_error + end end def stub_running diff --git a/test/cli/healthcheck_test.rb b/test/cli/healthcheck_test.rb deleted file mode 100644 index e273bf2e..00000000 --- a/test/cli/healthcheck_test.rb +++ /dev/null @@ -1,82 +0,0 @@ -require_relative "cli_test_case" - -class CliHealthcheckTest < CliTestCase - test "perform" do - # Prevent expected failures from outputting to terminal - Thread.report_on_exception = false - - Kamal::Cli::Healthcheck::Poller.stubs(:sleep) # No sleeping when retrying - Kamal::Configuration.any_instance.stubs(:run_id).returns("12345678901234567890123456789012") - - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false) - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "KAMAL_CONTAINER_NAME=\"healthcheck-app\"", "--env-file", ".kamal/env/roles/app-web.env", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999") - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false) - - # Fail twice to test retry logic - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") - .returns("starting") - .then - .returns("unhealthy") - .then - .returns("healthy") - - run_command("perform").tap do |output| - assert_match "container not ready (starting), retrying in 1s (attempt 1/7)...", output - assert_match "container not ready (unhealthy), retrying in 2s (attempt 2/7)...", output - assert_match "Container is healthy!", output - end - end - - test "perform failing to become healthy" do - # Prevent expected failures from outputting to terminal - Thread.report_on_exception = false - - Kamal::Cli::Healthcheck::Poller.stubs(:sleep) # No sleeping when retrying - - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false) - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "KAMAL_CONTAINER_NAME=\"healthcheck-app\"", "--env-file", ".kamal/env/roles/app-web.env", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999") - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false) - - # Continually report unhealthy - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") - .returns("unhealthy") - - # Capture logs when failing - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :logs, "--tail", 50, "2>&1") - .returns("some log output") - - # Capture container health log when failing - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_pretty_json) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{json .State.Health}}'") - .returns('{"Status":"unhealthy","Log":[{"ExitCode": 1,"Output": "/bin/sh: 1: curl: not found\n"}]}"') - - exception = assert_raises do - run_command("perform") - end - assert_match "container not ready (unhealthy)", exception.message - end - - test "raises an exception if primary does not have traefik" do - SSHKit::Backend::Abstract.any_instance.expects(:execute).never - - exception = assert_raises do - run_command("perform", config_file: "test/fixtures/deploy_workers_only.yml") - end - - assert_equal "The primary host is not configured to run Traefik", exception.message - end - - private - def run_command(*command, config_file: "test/fixtures/deploy_with_accessories.yml") - stdouted { Kamal::Cli::Healthcheck.start([ *command, "-c", config_file ]) } - end -end diff --git a/test/cli/main_test.rb b/test/cli/main_test.rb index 6149cf0b..3f7c5215 100644 --- a/test/cli/main_test.rb +++ b/test/cli/main_test.rb @@ -27,7 +27,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -40,7 +39,6 @@ class CliMainTest < CliTestCase assert_match /Log into image registry/, output assert_match /Pull app image/, output assert_match /Ensure Traefik is running/, output - assert_match /Ensure app can pass healthcheck/, output assert_match /Detect stale containers/, output assert_match /Prune old containers and images/, output assert_match /Releasing the deploy lock/, output @@ -53,7 +51,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -67,7 +64,6 @@ class CliMainTest < CliTestCase assert_match /Build and push app image/, output assert_hook_ran "pre-deploy", output, **hook_variables assert_match /Ensure Traefik is running/, output - assert_match /Ensure app can pass healthcheck/, output assert_match /Detect stale containers/, output assert_match /Prune old containers and images/, output assert_hook_ran "post-deploy", output, **hook_variables, runtime: true @@ -80,7 +76,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -90,7 +85,6 @@ class CliMainTest < CliTestCase assert_match /Log into image registry/, output assert_match /Pull app image/, output assert_match /Ensure Traefik is running/, output - assert_match /Ensure app can pass healthcheck/, output assert_match /Detect stale containers/, output assert_match /Prune old containers and images/, output assert_match /Releasing the deploy lock/, output @@ -156,7 +150,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -187,7 +180,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -199,7 +191,6 @@ class CliMainTest < CliTestCase invoke_options = { "config_file" => "test/fixtures/deploy_simple.yml", "version" => "999", "skip_hooks" => false, "verbose" => true } Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) @@ -212,7 +203,6 @@ class CliMainTest < CliTestCase assert_match /Build and push app image/, output assert_hook_ran "pre-deploy", output, **hook_variables assert_match /Running the pre-deploy hook.../, output - assert_match /Ensure app can pass healthcheck/, output assert_hook_ran "post-deploy", output, **hook_variables, runtime: true end end @@ -221,13 +211,11 @@ class CliMainTest < CliTestCase invoke_options = { "config_file" => "test/fixtures/deploy_simple.yml", "version" => "999", "skip_hooks" => false } Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) run_command("redeploy", "--skip_push").tap do |output| assert_match /Pull app image/, output - assert_match /Ensure app can pass healthcheck/, output end end diff --git a/test/commander_test.rb b/test/commander_test.rb index 2bbdb479..6a7ec536 100644 --- a/test/commander_test.rb +++ b/test/commander_test.rb @@ -99,6 +99,11 @@ class CommanderTest < ActiveSupport::TestCase assert_equal [ "workers" ], @kamal.roles_on("1.1.1.3").map(&:name) end + test "roles_on web comes first" do + configure_with(:deploy_with_two_roles_one_host) + assert_equal [ "web", "workers" ], @kamal.roles_on("1.1.1.1").map(&:name) + end + test "default group strategy" do assert_empty @kamal.boot_strategy end diff --git a/test/commands/healthcheck_test.rb b/test/commands/healthcheck_test.rb deleted file mode 100644 index 829aa0b2..00000000 --- a/test/commands/healthcheck_test.rb +++ /dev/null @@ -1,124 +0,0 @@ -require "test_helper" - -class CommandsHealthcheckTest < ActiveSupport::TestCase - setup do - @config = { - service: "app", image: "dhh/app", registry: { "username" => "dhh", "password" => "secret" }, servers: [ "1.1.1.1" ], - traefik: { "args" => { "accesslog.format" => "json", "metrics.prometheus.buckets" => "0.1,0.3,1.2,5.0" } } - } - end - - test "run" do - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with custom port" do - @config[:healthcheck] = { "port" => 3001 } - - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3001 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"curl -f http://localhost:3001/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with destination" do - @destination = "staging" - - assert_equal \ - "docker run --detach --name healthcheck-app-staging-123 --publish 3999:3000 --label service=healthcheck-app-staging -e KAMAL_CONTAINER_NAME=\"healthcheck-app-staging\" --env-file .kamal/env/roles/app-web-staging.env --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with custom healthcheck" do - @config[:healthcheck] = { "cmd" => "/bin/up" } - - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"/bin/up\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with custom options" do - @config[:servers] = { "web" => { "hosts" => [ "1.1.1.1" ], "options" => { "mount" => "somewhere" } } } - @config[:healthcheck] = { "exposed_port" => 4999 } - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 4999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --mount \"somewhere\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with tags" do - @config[:servers] = [ { "1.1.1.1" => "tag1" } ] - @config[:env] = {} - @config[:env]["tags"] = { "tag1" => { "ENV1" => "value1" } } - - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --env ENV1=\"value1\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "status" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'", - new_command.status.join(" ") - end - - test "container_health_log" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker inspect --format '{{json .State.Health}}'", - new_command.container_health_log.join(" ") - end - - test "stop" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker stop", - new_command.stop.join(" ") - end - - test "stop with destination" do - @destination = "staging" - - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-staging-123$ --quiet | xargs docker stop", - new_command.stop.join(" ") - end - - test "remove" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker container rm", - new_command.remove.join(" ") - end - - test "remove with destination" do - @destination = "staging" - - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-staging-123$ --quiet | xargs docker container rm", - new_command.remove.join(" ") - end - - test "logs" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker logs --tail 50 2>&1", - new_command.logs.join(" ") - end - - test "logs with custom lines number" do - @config[:healthcheck] = { "log_lines" => 150 } - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker logs --tail 150 2>&1", - new_command.logs.join(" ") - end - - test "logs with destination" do - @destination = "staging" - - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-staging-123$ --quiet | xargs docker logs --tail 50 2>&1", - new_command.logs.join(" ") - end - - private - def new_command - Kamal::Commands::Healthcheck.new(Kamal::Configuration.new(@config, destination: @destination, version: "123")) - end -end diff --git a/test/configuration_test.rb b/test/configuration_test.rb index 6a86b231..e7e7f151 100644 --- a/test/configuration_test.rb +++ b/test/configuration_test.rb @@ -272,7 +272,7 @@ class ConfigurationTest < ActiveSupport::TestCase volume_args: [ "--volume", "/local/path:/container/path" ], builder: {}, logging: [ "--log-opt", "max-size=\"10m\"" ], - healthcheck: { "path"=>"/up", "port"=>3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 } } + healthcheck: { "path"=>"/up", "port"=>3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 } } assert_equal expected_config, @config.to_h end diff --git a/test/fixtures/deploy_with_two_roles_one_host.yml b/test/fixtures/deploy_with_two_roles_one_host.yml new file mode 100644 index 00000000..cae05469 --- /dev/null +++ b/test/fixtures/deploy_with_two_roles_one_host.yml @@ -0,0 +1,15 @@ +service: app +image: dhh/app +servers: + workers: + hosts: + - 1.1.1.1 + web: + hosts: + - 1.1.1.1 +env: + REDIS_URL: redis://x/y +registry: + server: registry.digitalocean.com + username: user + password: pw diff --git a/test/integration/broken_deploy_test.rb b/test/integration/broken_deploy_test.rb new file mode 100644 index 00000000..06a5e3c6 --- /dev/null +++ b/test/integration/broken_deploy_test.rb @@ -0,0 +1,24 @@ +require_relative "integration_test" + +class BrokenDeployTest < IntegrationTest + test "deploying a bad image" do + @app = "app_with_roles" + + kamal :envify + + first_version = latest_app_version + + kamal :deploy + + assert_app_is_up version: first_version + assert_container_running host: :vm3, name: "app-workers-#{first_version}" + + second_version = break_app + + kamal :deploy, raise_on_error: false + + assert_app_is_up version: first_version + assert_container_running host: :vm3, name: "app-workers-#{first_version}" + assert_container_not_running host: :vm3, name: "app-workers-#{second_version}" + end +end diff --git a/test/integration/docker/deployer/app/config/deploy.yml b/test/integration/docker/deployer/app/config/deploy.yml index 38113737..397a49ca 100644 --- a/test/integration/docker/deployer/app/config/deploy.yml +++ b/test/integration/docker/deployer/app/config/deploy.yml @@ -28,6 +28,7 @@ builder: COMMIT_SHA: <%= `git rev-parse HEAD` %> healthcheck: cmd: wget -qO- http://localhost > /dev/null || exit 1 + max_attempts: 3 traefik: args: accesslog: true @@ -41,3 +42,4 @@ accessories: roles: - web stop_wait_time: 1 +readiness_delay: 0 diff --git a/test/integration/docker/deployer/app_with_roles/config/deploy.yml b/test/integration/docker/deployer/app_with_roles/config/deploy.yml index 004ffb25..2cf362c6 100644 --- a/test/integration/docker/deployer/app_with_roles/config/deploy.yml +++ b/test/integration/docker/deployer/app_with_roles/config/deploy.yml @@ -22,6 +22,7 @@ builder: COMMIT_SHA: <%= `git rev-parse HEAD` %> healthcheck: cmd: wget -qO- http://localhost > /dev/null || exit 1 + max_attempts: 3 traefik: args: accesslog: true @@ -35,3 +36,4 @@ accessories: roles: - web stop_wait_time: 1 +readiness_delay: 0 diff --git a/test/integration/docker/deployer/break_app.sh b/test/integration/docker/deployer/break_app.sh new file mode 100755 index 00000000..e8b19044 --- /dev/null +++ b/test/integration/docker/deployer/break_app.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +cd $1 && echo "bad nginx config" > default.conf && git commit -am 'Broken' diff --git a/test/integration/integration_test.rb b/test/integration/integration_test.rb index bd62c474..f8f54d42 100644 --- a/test/integration/integration_test.rb +++ b/test/integration/integration_test.rb @@ -78,6 +78,11 @@ class IntegrationTest < ActiveSupport::TestCase latest_app_version end + def break_app + deployer_exec "./break_app.sh #{@app}", workdir: "/" + latest_app_version + end + def latest_app_version deployer_exec("git rev-parse HEAD", capture: true) end @@ -131,4 +136,16 @@ class IntegrationTest < ActiveSupport::TestCase puts "Tried to get the response code again and got #{app_response.code}" end end + + def assert_container_running(host:, name:) + assert container_running?(host: host, name: name) + end + + def assert_container_not_running(host:, name:) + assert_not container_running?(host: host, name: name) + end + + def container_running?(host:, name:) + docker_compose("exec #{host} docker ps --filter=name=#{name} | tail -n+2", capture: true).tap { |x| p [ x, x.strip, x.strip.present? ] }.strip.present? + end end diff --git a/test/integration/main_test.rb b/test/integration/main_test.rb index a2dc8a0c..62857d4a 100644 --- a/test/integration/main_test.rb +++ b/test/integration/main_test.rb @@ -56,6 +56,12 @@ class MainTest < IntegrationTest assert_app_is_up version: version assert_hooks_ran "pre-connect", "pre-build", "pre-deploy", "post-deploy" assert_container_running host: :vm3, name: "app-workers-#{version}" + + second_version = update_app_rev + + kamal :redeploy + assert_app_is_up version: second_version + assert_container_running host: :vm3, name: "app-workers-#{second_version}" end test "config" do @@ -73,7 +79,7 @@ class MainTest < IntegrationTest assert_equal({ user: "root", port: 22, keepalive: true, keepalive_interval: 30, log_level: :fatal }, config[:ssh_options]) assert_equal({ "multiarch" => false, "args" => { "COMMIT_SHA" => version } }, config[:builder]) assert_equal [ "--log-opt", "max-size=\"10m\"" ], config[:logging] - assert_equal({ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord"=>"/tmp/kamal-cord", "log_lines" => 50, "cmd"=>"wget -qO- http://localhost > /dev/null || exit 1" }, config[:healthcheck]) + assert_equal({ "path" => "/up", "port" => 3000, "max_attempts" => 3, "cord"=>"/tmp/kamal-cord", "log_lines" => 50, "cmd"=>"wget -qO- http://localhost > /dev/null || exit 1" }, config[:healthcheck]) end test "setup and remove" do @@ -157,8 +163,4 @@ class MainTest < IntegrationTest assert vm1_image_ids.any? assert vm1_container_ids.any? end - - def assert_container_running(host:, name:) - assert docker_compose("exec #{host} docker ps --filter=name=#{name} -q", capture: true).strip.present? - end end From 07c56583961bcd559bde3595ffba831368dfa6fc Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Wed, 3 Apr 2024 16:15:27 +0100 Subject: [PATCH 2/9] Remove redundant method --- lib/kamal/cli/app.rb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/kamal/cli/app.rb b/lib/kamal/cli/app.rb index ae84e368..390f6a7d 100644 --- a/lib/kamal/cli/app.rb +++ b/lib/kamal/cli/app.rb @@ -287,8 +287,4 @@ class Kamal::Cli::App < Kamal::Cli::Base def version_or_latest options[:version] || KAMAL.config.latest_tag end - - def web_and_non_web_roles? - KAMAL.roles.any?(&:running_traefik?) && !KAMAL.roles.all?(&:running_traefik?) - end end From 5be6fa3b4e05fb2ba14eb0a991956c417d0aa359 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Wed, 3 Apr 2024 16:24:25 +0100 Subject: [PATCH 3/9] Improve comments --- lib/kamal/cli/app.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/kamal/cli/app.rb b/lib/kamal/cli/app.rb index 390f6a7d..14cf54ad 100644 --- a/lib/kamal/cli/app.rb +++ b/lib/kamal/cli/app.rb @@ -14,15 +14,16 @@ class Kamal::Cli::App < Kamal::Cli::Base end end + #  Primary hosts and roles are returned first, so they can open the barrier barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many? on(KAMAL.hosts, **KAMAL.boot_strategy) do |host| - # Ensure primary role is booted first to allow the web barrier to be opened KAMAL.roles_on(host).each do |role| Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run end end + #  Tag once the app booted on all hosts on(KAMAL.hosts) do |host| execute *KAMAL.auditor.record("Tagging #{KAMAL.config.absolute_image} as the latest image"), verbosity: :debug execute *KAMAL.app.tag_latest_image From 773ba3a5ab774dd696abf6c7dfe9584a7d4aafb8 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Tue, 7 May 2024 17:07:21 +0100 Subject: [PATCH 4/9] Show container logs and healthcheck status on failure --- lib/kamal/cli/app.rb | 2 +- lib/kamal/cli/app/boot.rb | 14 +++++++------- lib/kamal/commands/app/containers.rb | 8 ++++++++ lib/kamal/commands/app/logging.rb | 4 ++-- lib/kamal/commands/base.rb | 1 - test/cli/app_test.rb | 4 ++-- test/integration/broken_deploy_test.rb | 12 +++++++++++- 7 files changed, 31 insertions(+), 14 deletions(-) diff --git a/lib/kamal/cli/app.rb b/lib/kamal/cli/app.rb index 14cf54ad..356b09f8 100644 --- a/lib/kamal/cli/app.rb +++ b/lib/kamal/cli/app.rb @@ -15,7 +15,7 @@ class Kamal::Cli::App < Kamal::Cli::Base end #  Primary hosts and roles are returned first, so they can open the barrier - barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many? + barrier = Kamal::Cli::Healthcheck::Barrier.new on(KAMAL.hosts, **KAMAL.boot_strategy) do |host| KAMAL.roles_on(host).each do |role| diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index de219a23..e9b167fe 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -1,6 +1,6 @@ class Kamal::Cli::App::Boot attr_reader :host, :role, :version, :barrier, :sshkit - delegate :execute, :capture_with_info, :info, to: :sshkit + delegate :execute, :capture_with_info, :capture_with_pretty_json, :info, :error, to: :sshkit delegate :uses_cord?, :assets?, :running_traefik?, to: :role def initialize(host, role, sshkit, version, barrier) @@ -55,7 +55,11 @@ class Kamal::Cli::App::Boot reach_barrier rescue => e - close_barrier if barrier_role? + if barrier_role? && barrier.close + info "Deploy failed, so closed barrier (#{host})" + error capture_with_info(*app.logs(version: version)) + error capture_with_info(*app.container_health_log(version: version)) + end execute *app.stop(version: version), raise_on_non_zero_exit: false raise @@ -92,14 +96,10 @@ class Kamal::Cli::App::Boot barrier.wait info "Barrier opened (#{host})" rescue Kamal::Cli::Healthcheck::Error - info "Barrier closed, shutting down new container... (#{host})" + info "Barrier closed, shutting down new container (#{host})..." raise end - def close_barrier - barrier&.close - end - def barrier_role? role == KAMAL.primary_role end diff --git a/lib/kamal/commands/app/containers.rb b/lib/kamal/commands/app/containers.rb index a62d9a35..0bab388b 100644 --- a/lib/kamal/commands/app/containers.rb +++ b/lib/kamal/commands/app/containers.rb @@ -1,4 +1,6 @@ module Kamal::Commands::App::Containers + DOCKER_HEALTH_LOG_FORMAT = "'{{json .State.Health}}'" + def list_containers docker :container, :ls, "--all", *filter_args end @@ -20,4 +22,10 @@ module Kamal::Commands::App::Containers def remove_containers docker :container, :prune, "--force", *filter_args end + + def container_health_log(version:) + pipe \ + container_id_for(container_name: container_name(version)), + xargs(docker(:inspect, "--format", DOCKER_HEALTH_LOG_FORMAT)) + end end diff --git a/lib/kamal/commands/app/logging.rb b/lib/kamal/commands/app/logging.rb index 7e760512..8acb49e9 100644 --- a/lib/kamal/commands/app/logging.rb +++ b/lib/kamal/commands/app/logging.rb @@ -1,7 +1,7 @@ module Kamal::Commands::App::Logging - def logs(since: nil, lines: nil, grep: nil) + def logs(version: nil, since: nil, lines: nil, grep: nil) pipe \ - current_running_container_id, + version ? container_id_for_version(version) : current_running_container_id, "xargs docker logs#{" --since #{since}" if since}#{" --tail #{lines}" if lines} 2>&1", ("grep '#{grep}'" if grep) end diff --git a/lib/kamal/commands/base.rb b/lib/kamal/commands/base.rb index a98d6330..2173d064 100644 --- a/lib/kamal/commands/base.rb +++ b/lib/kamal/commands/base.rb @@ -3,7 +3,6 @@ module Kamal::Commands delegate :sensitive, :argumentize, to: Kamal::Utils DOCKER_HEALTH_STATUS_FORMAT = "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'" - DOCKER_HEALTH_LOG_FORMAT = "'{{json .State.Health}}'" attr_accessor :config diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index ced41317..3d6061b5 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -162,8 +162,8 @@ class CliAppTest < CliTestCase run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| assert_match "Waiting at web barrier (1.1.1.3)...", output assert_match "Waiting at web barrier (1.1.1.4)...", output - assert_match "Barrier closed, shutting down new container... (1.1.1.3)", output - assert_match "Barrier closed, shutting down new container... (1.1.1.4)", output + assert_match "Barrier closed, shutting down new container (1.1.1.3)...", output + assert_match "Barrier closed, shutting down new container (1.1.1.4)...", output assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.1", output assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.2", output assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.3", output diff --git a/test/integration/broken_deploy_test.rb b/test/integration/broken_deploy_test.rb index 06a5e3c6..efa90fc9 100644 --- a/test/integration/broken_deploy_test.rb +++ b/test/integration/broken_deploy_test.rb @@ -15,10 +15,20 @@ class BrokenDeployTest < IntegrationTest second_version = break_app - kamal :deploy, raise_on_error: false + output = kamal :deploy, raise_on_error: false, capture: true + assert_failed_deploy output assert_app_is_up version: first_version assert_container_running host: :vm3, name: "app-workers-#{first_version}" assert_container_not_running host: :vm3, name: "app-workers-#{second_version}" end + + private + def assert_failed_deploy(output) + assert_match "Waiting at web barrier (vm3)...", output + assert_match /Deploy failed, so closed barrier \(vm[12]\)/, output + assert_match "Barrier closed, shutting down new container (vm3)...", output + assert_match "nginx: [emerg] unexpected end of file, expecting \";\" or \"}\" in /etc/nginx/conf.d/default.conf:2", output + assert_match 'ERROR {"Status":"unhealthy","FailingStreak":0,"Log":[]}', output + end end From bb2ca81d87e6860e47cd9f7539cb0db2fd083e45 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 20 May 2024 11:26:22 +0100 Subject: [PATCH 5/9] Fix rebase method duplication --- lib/kamal/cli/app/boot.rb | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index e9b167fe..a91c6ffa 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -22,18 +22,6 @@ class Kamal::Cli::App::Boot end private - def app - @app ||= KAMAL.app(role: role, host: host) - end - - def auditor - @auditor = KAMAL.auditor(role: role) - end - - def audit(message) - execute *auditor.record(message), verbosity: :debug - end - def old_version_renamed_if_clashing if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present? renamed_version = "#{version}_replaced_#{SecureRandom.hex(8)}" @@ -105,7 +93,7 @@ class Kamal::Cli::App::Boot end def app - @app ||= KAMAL.app(role: role) + @app ||= KAMAL.app(role: role, host: host) end def auditor From ee758d951af5c513a9b1b272dc39393a048936cd Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 20 May 2024 12:17:01 +0100 Subject: [PATCH 6/9] Only use barrier when needed, more descriptive info --- lib/kamal/cli/app.rb | 2 +- lib/kamal/cli/app/boot.rb | 12 ++++++------ test/cli/app_test.rb | 16 ++++++++-------- test/integration/broken_deploy_test.rb | 6 +++--- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lib/kamal/cli/app.rb b/lib/kamal/cli/app.rb index 356b09f8..14cf54ad 100644 --- a/lib/kamal/cli/app.rb +++ b/lib/kamal/cli/app.rb @@ -15,7 +15,7 @@ class Kamal::Cli::App < Kamal::Cli::Base end #  Primary hosts and roles are returned first, so they can open the barrier - barrier = Kamal::Cli::Healthcheck::Barrier.new + barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many? on(KAMAL.hosts, **KAMAL.boot_strategy) do |host| KAMAL.roles_on(host).each do |role| diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index a91c6ffa..5100e3cc 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -43,8 +43,8 @@ class Kamal::Cli::App::Boot reach_barrier rescue => e - if barrier_role? && barrier.close - info "Deploy failed, so closed barrier (#{host})" + if barrier_role? && barrier&.close + info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})" error capture_with_info(*app.logs(version: version)) error capture_with_info(*app.container_health_log(version: version)) end @@ -71,7 +71,7 @@ class Kamal::Cli::App::Boot if barrier if barrier_role? if barrier.open - info "Opened barrier (#{host})" + info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})" end else wait_for_barrier @@ -80,11 +80,11 @@ class Kamal::Cli::App::Boot end def wait_for_barrier - info "Waiting at web barrier (#{host})..." + info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..." barrier.wait - info "Barrier opened (#{host})" + info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})" rescue Kamal::Cli::Healthcheck::Error - info "Barrier closed, shutting down new container (#{host})..." + info "First #{KAMAL.primary_role} container is unhealthy, stopping (#{host})" raise end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index 3d6061b5..b5b03e27 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -136,10 +136,10 @@ class CliAppTest < CliTestCase .returns("running").at_least_once # workers health check run_command("boot", config: :with_roles, host: nil).tap do |output| - assert_match "Waiting at web barrier (1.1.1.3)...", output - assert_match "Waiting at web barrier (1.1.1.4)...", output - assert_match "Barrier opened (1.1.1.3)", output - assert_match "Barrier opened (1.1.1.4)", output + assert_match "Waiting for a healthy web container (1.1.1.3)...", output + assert_match "Waiting for a healthy web container (1.1.1.4)...", output + assert_match "First web container is healthy, continuing (1.1.1.3)", output + assert_match "First web container is healthy, continuing (1.1.1.4)", output end end @@ -160,10 +160,10 @@ class CliAppTest < CliTestCase stderred do run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| - assert_match "Waiting at web barrier (1.1.1.3)...", output - assert_match "Waiting at web barrier (1.1.1.4)...", output - assert_match "Barrier closed, shutting down new container (1.1.1.3)...", output - assert_match "Barrier closed, shutting down new container (1.1.1.4)...", output + assert_match "Waiting for a healthy web container (1.1.1.3)...", output + assert_match "Waiting for a healthy web container (1.1.1.4)...", output + assert_match "First web container is unhealthy, stopping (1.1.1.3)", output + assert_match "First web container is unhealthy, stopping (1.1.1.4)", output assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.1", output assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.2", output assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.3", output diff --git a/test/integration/broken_deploy_test.rb b/test/integration/broken_deploy_test.rb index efa90fc9..6b71844e 100644 --- a/test/integration/broken_deploy_test.rb +++ b/test/integration/broken_deploy_test.rb @@ -25,9 +25,9 @@ class BrokenDeployTest < IntegrationTest private def assert_failed_deploy(output) - assert_match "Waiting at web barrier (vm3)...", output - assert_match /Deploy failed, so closed barrier \(vm[12]\)/, output - assert_match "Barrier closed, shutting down new container (vm3)...", output + assert_match "Waiting for a healthy web container (vm3)...", output + assert_match /First #{KAMAL.primary_role} container is unhealthy, stopping \(vm[12]\)/, output + assert_match "First #{KAMAL.primary_role} container unhealthy, stopping other roles (vm3)...", output assert_match "nginx: [emerg] unexpected end of file, expecting \";\" or \"}\" in /etc/nginx/conf.d/default.conf:2", output assert_match 'ERROR {"Status":"unhealthy","FailingStreak":0,"Log":[]}', output end From 78c0a0ba4b2093c2062a4b9336facece5da22f2e Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Tue, 21 May 2024 08:33:49 +0100 Subject: [PATCH 7/9] Don't start other roles we have a healthy container If a primary role container is unhealthy, we might take a while to timeout the health check poller. In the meantime if we have started the other roles, they'll be running tow containers. This could be a problem, especially if they read run jobs as that doubles the worker capacity which could cause exessive load. We'll wait for the first primary role container to boot successfully before starting the other containers from other roles. --- lib/kamal/cli/app/boot.rb | 41 ++++++++++++++++++++++++--------------- test/cli/app_test.rb | 4 ---- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index 5100e3cc..47f8c21c 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -34,6 +34,8 @@ class Kamal::Cli::App::Boot end def start_new_version + wait_at_barrier if queuer? + audit "Booted app version #{version}" execute *app.tie_cord(role.cord_host_file) if uses_cord? @@ -41,13 +43,10 @@ class Kamal::Cli::App::Boot execute *app.run(hostname: hostname) Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } - reach_barrier + release_barrier if gatekeeper? rescue => e - if barrier_role? && barrier&.close - info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})" - error capture_with_info(*app.logs(version: version)) - error capture_with_info(*app.container_health_log(version: version)) - end + close_barrier if gatekeeper? + execute *app.stop(version: version), raise_on_non_zero_exit: false raise @@ -67,19 +66,13 @@ class Kamal::Cli::App::Boot execute *app.clean_up_assets if assets? end - def reach_barrier - if barrier - if barrier_role? - if barrier.open - info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})" - end - else - wait_for_barrier - end + def release_barrier + if barrier.open + info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})" end end - def wait_for_barrier + def wait_at_barrier info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..." barrier.wait info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})" @@ -88,6 +81,14 @@ class Kamal::Cli::App::Boot raise end + def close_barrier + if barrier.close + info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})" + error capture_with_info(*app.logs(version: version)) + error capture_with_info(*app.container_health_log(version: version)) + end + end + def barrier_role? role == KAMAL.primary_role end @@ -103,4 +104,12 @@ class Kamal::Cli::App::Boot def audit(message) execute *auditor.record(message), verbosity: :debug end + + def gatekeeper? + barrier && barrier_role? + end + + def queuer? + barrier && !barrier_role? + end end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index b5b03e27..f684deb8 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -154,10 +154,6 @@ class CliAppTest < CliTestCase .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") .returns("unhealthy").at_least_once # web health check failing - SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") - .returns("running").at_least_once # workers health check passing - stderred do run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| assert_match "Waiting for a healthy web container (1.1.1.3)...", output From fa7e941648ce78e2ee46cb4e9b567078072557d6 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Tue, 21 May 2024 09:31:08 +0100 Subject: [PATCH 8/9] Make SSHKit::Runner::Parallel fail slow Using a different SSHKit runner doesn't work well, because the group runner uses the Parallel runner internally. So instead we'll patch its behaviour to fail slow. We'll also get it to return all the errors so we can report on all the hosts that failed. --- bin/kamal | 9 ++++++ lib/kamal/commander.rb | 1 - lib/kamal/sshkit_with_ext.rb | 59 ++++++++++++++++++++++-------------- 3 files changed, 45 insertions(+), 24 deletions(-) diff --git a/bin/kamal b/bin/kamal index 96c0dc64..a8e2ac2b 100755 --- a/bin/kamal +++ b/bin/kamal @@ -7,6 +7,15 @@ require "kamal" begin Kamal::Cli::Main.start(ARGV) +rescue SSHKit::Runner::MultipleExecuteError => e + e.execute_errors.each do |execute_error| + puts " \e[31mERROR (#{execute_error.cause.class}): #{execute_error.message}\e[0m" + end + if ENV["VERBOSE"] + puts "Backtrace for the first error:" + puts e.execute_errors.first.cause.backtrace + end + exit 1 rescue SSHKit::Runner::ExecuteError => e puts " \e[31mERROR (#{e.cause.class}): #{e.message}\e[0m" puts e.cause.backtrace if ENV["VERBOSE"] diff --git a/lib/kamal/commander.rb b/lib/kamal/commander.rb index f4c54215..e7c5d21f 100644 --- a/lib/kamal/commander.rb +++ b/lib/kamal/commander.rb @@ -150,7 +150,6 @@ class Kamal::Commander sshkit.max_concurrent_starts = config.sshkit.max_concurrent_starts sshkit.ssh_options = config.ssh.options end - SSHKit.config.default_runner = SSHKit::Runner::ParallelCompleteAll SSHKit.config.command_map[:docker] = "docker" # No need to use /usr/bin/env, just clogs up the logs SSHKit.config.output_verbosity = verbosity end diff --git a/lib/kamal/sshkit_with_ext.rb b/lib/kamal/sshkit_with_ext.rb index c556774b..373ae843 100644 --- a/lib/kamal/sshkit_with_ext.rb +++ b/lib/kamal/sshkit_with_ext.rb @@ -104,33 +104,46 @@ class SSHKit::Backend::Netssh prepend LimitConcurrentStartsInstance end -require "thread" +class SSHKit::Runner::MultipleExecuteError < SSHKit::StandardError + attr_reader :execute_errors -module SSHKit - module Runner - class ParallelCompleteAll < Abstract - def execute - threads = hosts.map do |host| - Thread.new(host) do |h| - begin - backend(h, &block).run - rescue ::StandardError => e - e2 = SSHKit::Runner::ExecuteError.new e - raise e2, "Exception while executing #{host.user ? "as #{host.user}@" : "on host "}#{host}: #{e.message}" - end - end - end + def initialize(execute_errors) + @execute_errors = execute_errors + end +end - exception = nil - threads.each do |t| - begin - t.join - rescue SSHKit::Runner::ExecuteError => e - exception ||= e - end +class SSHKit::Runner::Parallel + # SSHKit joins the threads in sequence and fails on the first error it encounters, which means that we wait threads + # before the first failure to complete but not for ones after. + # + # We'll patch it to wait for them all to complete, and to record all the threads that errored so we can see when a + # problem occurs on multiple hosts. + module CompleteAll + def execute + threads = hosts.map do |host| + Thread.new(host) do |h| + backend(h, &block).run + rescue ::StandardError => e + e2 = SSHKit::Runner::ExecuteError.new e + raise e2, "Exception while executing #{host.user ? "as #{host.user}@" : "on host "}#{host}: #{e.message}" end - raise exception if exception + end + + exceptions = [] + threads.each do |t| + begin + t.join + rescue SSHKit::Runner::ExecuteError => e + exceptions << e + end + end + if exceptions.one? + raise exceptions.first + elsif exceptions.many? + raise SSHKit::Runner::MultipleExecuteError.new(exceptions) end end end + + prepend CompleteAll end From 706b82baa164dec68d1a88c22e9db9d73e394763 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Tue, 21 May 2024 10:40:01 +0100 Subject: [PATCH 9/9] Simplify messages and remove multiple execute error --- bin/kamal | 9 -------- lib/kamal/cli/app/boot.rb | 32 +++++++++++++++----------- lib/kamal/cli/main.rb | 2 +- lib/kamal/sshkit_with_ext.rb | 10 +------- test/cli/app_test.rb | 20 ++++++++-------- test/integration/broken_deploy_test.rb | 6 ++--- 6 files changed, 32 insertions(+), 47 deletions(-) diff --git a/bin/kamal b/bin/kamal index a8e2ac2b..96c0dc64 100755 --- a/bin/kamal +++ b/bin/kamal @@ -7,15 +7,6 @@ require "kamal" begin Kamal::Cli::Main.start(ARGV) -rescue SSHKit::Runner::MultipleExecuteError => e - e.execute_errors.each do |execute_error| - puts " \e[31mERROR (#{execute_error.cause.class}): #{execute_error.message}\e[0m" - end - if ENV["VERBOSE"] - puts "Backtrace for the first error:" - puts e.execute_errors.first.cause.backtrace - end - exit 1 rescue SSHKit::Runner::ExecuteError => e puts " \e[31mERROR (#{e.cause.class}): #{e.message}\e[0m" puts e.cause.backtrace if ENV["VERBOSE"] diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index 47f8c21c..c4125771 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -14,7 +14,17 @@ class Kamal::Cli::App::Boot def run old_version = old_version_renamed_if_clashing - start_new_version + wait_at_barrier if queuer? + + begin + start_new_version + rescue => e + close_barrier if gatekeeper? + stop_new_version + raise + end + + release_barrier if gatekeeper? if old_version stop_old_version(old_version) @@ -34,22 +44,16 @@ class Kamal::Cli::App::Boot end def start_new_version - wait_at_barrier if queuer? - audit "Booted app version #{version}" execute *app.tie_cord(role.cord_host_file) if uses_cord? hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}" execute *app.run(hostname: hostname) Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } + end - release_barrier if gatekeeper? - rescue => e - close_barrier if gatekeeper? - + def stop_new_version execute *app.stop(version: version), raise_on_non_zero_exit: false - - raise end def stop_old_version(version) @@ -68,22 +72,22 @@ class Kamal::Cli::App::Boot def release_barrier if barrier.open - info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})" + info "First #{KAMAL.primary_role} container is healthy on #{host}, booting other roles" end end def wait_at_barrier - info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..." + info "Waiting for the first healthy #{KAMAL.primary_role} container before booting #{role} on #{host}..." barrier.wait - info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})" + info "First #{KAMAL.primary_role} container is healthy, booting #{role} on #{host}..." rescue Kamal::Cli::Healthcheck::Error - info "First #{KAMAL.primary_role} container is unhealthy, stopping (#{host})" + info "First #{KAMAL.primary_role} container is unhealthy, not booting #{role} on #{host}" raise end def close_barrier if barrier.close - info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})" + info "First #{KAMAL.primary_role} container is unhealthy on #{host}, not booting other roles" error capture_with_info(*app.logs(version: version)) error capture_with_info(*app.container_health_log(version: version)) end diff --git a/lib/kamal/cli/main.rb b/lib/kamal/cli/main.rb index 594fb3fd..c05c1503 100644 --- a/lib/kamal/cli/main.rb +++ b/lib/kamal/cli/main.rb @@ -244,7 +244,7 @@ class Kamal::Cli::Main < Kamal::Cli::Base raise "Container not found" unless container_id.present? end end - rescue SSHKit::Runner::ExecuteError => e + rescue SSHKit::Runner::ExecuteError, SSHKit::Runner::MultipleExecuteError => e if e.message =~ /Container not found/ say "Error looking for container version #{version}: #{e.message}" return false diff --git a/lib/kamal/sshkit_with_ext.rb b/lib/kamal/sshkit_with_ext.rb index 373ae843..2d0257a8 100644 --- a/lib/kamal/sshkit_with_ext.rb +++ b/lib/kamal/sshkit_with_ext.rb @@ -104,14 +104,6 @@ class SSHKit::Backend::Netssh prepend LimitConcurrentStartsInstance end -class SSHKit::Runner::MultipleExecuteError < SSHKit::StandardError - attr_reader :execute_errors - - def initialize(execute_errors) - @execute_errors = execute_errors - end -end - class SSHKit::Runner::Parallel # SSHKit joins the threads in sequence and fails on the first error it encounters, which means that we wait threads # before the first failure to complete but not for ones after. @@ -140,7 +132,7 @@ class SSHKit::Runner::Parallel if exceptions.one? raise exceptions.first elsif exceptions.many? - raise SSHKit::Runner::MultipleExecuteError.new(exceptions) + raise exceptions.first, [ "Exceptions on #{exceptions.count} hosts:", exceptions.map(&:message) ].join("\n") end end end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index f684deb8..7a2a266e 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -136,11 +136,11 @@ class CliAppTest < CliTestCase .returns("running").at_least_once # workers health check run_command("boot", config: :with_roles, host: nil).tap do |output| - assert_match "Waiting for a healthy web container (1.1.1.3)...", output - assert_match "Waiting for a healthy web container (1.1.1.4)...", output - assert_match "First web container is healthy, continuing (1.1.1.3)", output - assert_match "First web container is healthy, continuing (1.1.1.4)", output - end + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.3...", output + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.4...", output + assert_match "First web container is healthy, booting workers on 1.1.1.3", output + assert_match "First web container is healthy, booting workers on 1.1.1.4", output + end end test "boot with web barrier closed" do @@ -156,14 +156,12 @@ class CliAppTest < CliTestCase stderred do run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| - assert_match "Waiting for a healthy web container (1.1.1.3)...", output - assert_match "Waiting for a healthy web container (1.1.1.4)...", output - assert_match "First web container is unhealthy, stopping (1.1.1.3)", output - assert_match "First web container is unhealthy, stopping (1.1.1.4)", output + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.3...", output + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.4...", output + assert_match "First web container is unhealthy, not booting workers on 1.1.1.3", output + assert_match "First web container is unhealthy, not booting workers on 1.1.1.4", output assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.1", output assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.2", output - assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.3", output - assert_match "Running docker container ls --all --filter name=^app-workers-latest$ --quiet | xargs docker stop on 1.1.1.4", output end end ensure diff --git a/test/integration/broken_deploy_test.rb b/test/integration/broken_deploy_test.rb index 6b71844e..a3f74d45 100644 --- a/test/integration/broken_deploy_test.rb +++ b/test/integration/broken_deploy_test.rb @@ -25,9 +25,9 @@ class BrokenDeployTest < IntegrationTest private def assert_failed_deploy(output) - assert_match "Waiting for a healthy web container (vm3)...", output - assert_match /First #{KAMAL.primary_role} container is unhealthy, stopping \(vm[12]\)/, output - assert_match "First #{KAMAL.primary_role} container unhealthy, stopping other roles (vm3)...", output + assert_match "Waiting for the first healthy web container before booting workers on vm3...", output + assert_match /First web container is unhealthy on vm[12], not booting other roles/, output + assert_match "First web container is unhealthy, not booting workers on vm3", output assert_match "nginx: [emerg] unexpected end of file, expecting \";\" or \"}\" in /etc/nginx/conf.d/default.conf:2", output assert_match 'ERROR {"Status":"unhealthy","FailingStreak":0,"Log":[]}', output end