diff --git a/lib/kamal/cli/app.rb b/lib/kamal/cli/app.rb index 040ab360..523eb487 100644 --- a/lib/kamal/cli/app.rb +++ b/lib/kamal/cli/app.rb @@ -14,12 +14,16 @@ class Kamal::Cli::App < Kamal::Cli::Base end end + #  Primary hosts and roles are returned first, so they can open the barrier + barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many? + on(KAMAL.hosts, **KAMAL.boot_strategy) do |host| KAMAL.roles_on(host).each do |role| - Kamal::Cli::App::Boot.new(host, role, version, self).run + Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run end end + #  Tag once the app booted on all hosts on(KAMAL.hosts) do |host| execute *KAMAL.auditor.record("Tagging #{KAMAL.config.absolute_image} as the latest image"), verbosity: :debug execute *KAMAL.app.tag_latest_image diff --git a/lib/kamal/cli/app/boot.rb b/lib/kamal/cli/app/boot.rb index 78670104..c4125771 100644 --- a/lib/kamal/cli/app/boot.rb +++ b/lib/kamal/cli/app/boot.rb @@ -1,19 +1,30 @@ class Kamal::Cli::App::Boot - attr_reader :host, :role, :version, :sshkit - delegate :execute, :capture_with_info, :info, to: :sshkit - delegate :uses_cord?, :assets?, to: :role + attr_reader :host, :role, :version, :barrier, :sshkit + delegate :execute, :capture_with_info, :capture_with_pretty_json, :info, :error, to: :sshkit + delegate :uses_cord?, :assets?, :running_traefik?, to: :role - def initialize(host, role, version, sshkit) + def initialize(host, role, sshkit, version, barrier) @host = host @role = role @version = version + @barrier = barrier @sshkit = sshkit end def run old_version = old_version_renamed_if_clashing - start_new_version + wait_at_barrier if queuer? + + begin + start_new_version + rescue => e + close_barrier if gatekeeper? + stop_new_version + raise + end + + release_barrier if gatekeeper? if old_version stop_old_version(old_version) @@ -21,18 +32,6 @@ class Kamal::Cli::App::Boot end private - def app - @app ||= KAMAL.app(role: role, host: host) - end - - def auditor - @auditor = KAMAL.auditor(role: role) - end - - def audit(message) - execute *auditor.record(message), verbosity: :debug - end - def old_version_renamed_if_clashing if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present? renamed_version = "#{version}_replaced_#{SecureRandom.hex(8)}" @@ -46,12 +45,17 @@ class Kamal::Cli::App::Boot def start_new_version audit "Booted app version #{version}" + execute *app.tie_cord(role.cord_host_file) if uses_cord? hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}" execute *app.run(hostname: hostname) Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) } end + def stop_new_version + execute *app.stop(version: version), raise_on_non_zero_exit: false + end + def stop_old_version(version) if uses_cord? cord = capture_with_info(*app.cord(version: version), raise_on_non_zero_exit: false).strip @@ -65,4 +69,51 @@ class Kamal::Cli::App::Boot execute *app.clean_up_assets if assets? end + + def release_barrier + if barrier.open + info "First #{KAMAL.primary_role} container is healthy on #{host}, booting other roles" + end + end + + def wait_at_barrier + info "Waiting for the first healthy #{KAMAL.primary_role} container before booting #{role} on #{host}..." + barrier.wait + info "First #{KAMAL.primary_role} container is healthy, booting #{role} on #{host}..." + rescue Kamal::Cli::Healthcheck::Error + info "First #{KAMAL.primary_role} container is unhealthy, not booting #{role} on #{host}" + raise + end + + def close_barrier + if barrier.close + info "First #{KAMAL.primary_role} container is unhealthy on #{host}, not booting other roles" + error capture_with_info(*app.logs(version: version)) + error capture_with_info(*app.container_health_log(version: version)) + end + end + + def barrier_role? + role == KAMAL.primary_role + end + + def app + @app ||= KAMAL.app(role: role, host: host) + end + + def auditor + @auditor = KAMAL.auditor(role: role) + end + + def audit(message) + execute *auditor.record(message), verbosity: :debug + end + + def gatekeeper? + barrier && barrier_role? + end + + def queuer? + barrier && !barrier_role? + end end diff --git a/lib/kamal/cli/healthcheck.rb b/lib/kamal/cli/healthcheck.rb deleted file mode 100644 index b3dbf6bd..00000000 --- a/lib/kamal/cli/healthcheck.rb +++ /dev/null @@ -1,21 +0,0 @@ -class Kamal::Cli::Healthcheck < Kamal::Cli::Base - default_command :perform - - desc "perform", "Health check current app version" - def perform - raise "The primary host is not configured to run Traefik" unless KAMAL.config.role(KAMAL.config.primary_role).running_traefik? - on(KAMAL.primary_host) do - begin - execute *KAMAL.healthcheck.run - Poller.wait_for_healthy { capture_with_info(*KAMAL.healthcheck.status) } - rescue Poller::HealthcheckError => e - error capture_with_info(*KAMAL.healthcheck.logs) - error capture_with_pretty_json(*KAMAL.healthcheck.container_health_log) - raise - ensure - execute *KAMAL.healthcheck.stop, raise_on_non_zero_exit: false - execute *KAMAL.healthcheck.remove, raise_on_non_zero_exit: false - end - end - end -end diff --git a/lib/kamal/cli/healthcheck/barrier.rb b/lib/kamal/cli/healthcheck/barrier.rb new file mode 100644 index 00000000..0fbfb511 --- /dev/null +++ b/lib/kamal/cli/healthcheck/barrier.rb @@ -0,0 +1,31 @@ +class Kamal::Cli::Healthcheck::Barrier + def initialize + @ivar = Concurrent::IVar.new + end + + def close + set(false) + end + + def open + set(true) + end + + def wait + unless opened? + raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier") + end + end + + private + def opened? + @ivar.value + end + + def set(value) + @ivar.set(value) + true + rescue Concurrent::MultipleAssignmentError + false + end +end diff --git a/lib/kamal/cli/healthcheck/error.rb b/lib/kamal/cli/healthcheck/error.rb new file mode 100644 index 00000000..8824a72e --- /dev/null +++ b/lib/kamal/cli/healthcheck/error.rb @@ -0,0 +1,2 @@ +class Kamal::Cli::Healthcheck::Error < StandardError +end diff --git a/lib/kamal/cli/healthcheck/poller.rb b/lib/kamal/cli/healthcheck/poller.rb index 9d91adfc..06898b1c 100644 --- a/lib/kamal/cli/healthcheck/poller.rb +++ b/lib/kamal/cli/healthcheck/poller.rb @@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller TRAEFIK_UPDATE_DELAY = 5 - class HealthcheckError < StandardError; end def wait_for_healthy(pause_after_ready: false, &block) attempt = 1 @@ -16,9 +15,9 @@ module Kamal::Cli::Healthcheck::Poller when "running" # No health check configured sleep KAMAL.config.readiness_delay if pause_after_ready else - raise HealthcheckError, "container not ready (#{status})" + raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})" end - rescue HealthcheckError => e + rescue Kamal::Cli::Healthcheck::Error => e if attempt <= max_attempts info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..." sleep attempt @@ -41,9 +40,9 @@ module Kamal::Cli::Healthcheck::Poller when "unhealthy" sleep TRAEFIK_UPDATE_DELAY if pause_after_ready else - raise HealthcheckError, "container not unhealthy (#{status})" + raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})" end - rescue HealthcheckError => e + rescue Kamal::Cli::Healthcheck::Error => e if attempt <= max_attempts info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..." sleep attempt diff --git a/lib/kamal/cli/main.rb b/lib/kamal/cli/main.rb index a09724b6..f17fd6cd 100644 --- a/lib/kamal/cli/main.rb +++ b/lib/kamal/cli/main.rb @@ -41,11 +41,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base say "Ensure Traefik is running...", :magenta invoke "kamal:cli:traefik:boot", [], invoke_options - if KAMAL.config.role(KAMAL.config.primary_role).running_traefik? - say "Ensure app can pass healthcheck...", :magenta - invoke "kamal:cli:healthcheck:perform", [], invoke_options - end - say "Detect stale containers...", :magenta invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true) @@ -76,9 +71,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base run_hook "pre-deploy" - say "Ensure app can pass healthcheck...", :magenta - invoke "kamal:cli:healthcheck:perform", [], invoke_options - say "Detect stale containers...", :magenta invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true) @@ -227,9 +219,6 @@ class Kamal::Cli::Main < Kamal::Cli::Base desc "env", "Manage environment files" subcommand "env", Kamal::Cli::Env - desc "healthcheck", "Healthcheck application" - subcommand "healthcheck", Kamal::Cli::Healthcheck - desc "lock", "Manage the deploy lock" subcommand "lock", Kamal::Cli::Lock @@ -254,7 +243,7 @@ class Kamal::Cli::Main < Kamal::Cli::Base raise "Container not found" unless container_id.present? end end - rescue SSHKit::Runner::ExecuteError => e + rescue SSHKit::Runner::ExecuteError, SSHKit::Runner::MultipleExecuteError => e if e.message =~ /Container not found/ say "Error looking for container version #{version}: #{e.message}" return false diff --git a/lib/kamal/commands/app/containers.rb b/lib/kamal/commands/app/containers.rb index a62d9a35..0bab388b 100644 --- a/lib/kamal/commands/app/containers.rb +++ b/lib/kamal/commands/app/containers.rb @@ -1,4 +1,6 @@ module Kamal::Commands::App::Containers + DOCKER_HEALTH_LOG_FORMAT = "'{{json .State.Health}}'" + def list_containers docker :container, :ls, "--all", *filter_args end @@ -20,4 +22,10 @@ module Kamal::Commands::App::Containers def remove_containers docker :container, :prune, "--force", *filter_args end + + def container_health_log(version:) + pipe \ + container_id_for(container_name: container_name(version)), + xargs(docker(:inspect, "--format", DOCKER_HEALTH_LOG_FORMAT)) + end end diff --git a/lib/kamal/commands/app/logging.rb b/lib/kamal/commands/app/logging.rb index 7e760512..8acb49e9 100644 --- a/lib/kamal/commands/app/logging.rb +++ b/lib/kamal/commands/app/logging.rb @@ -1,7 +1,7 @@ module Kamal::Commands::App::Logging - def logs(since: nil, lines: nil, grep: nil) + def logs(version: nil, since: nil, lines: nil, grep: nil) pipe \ - current_running_container_id, + version ? container_id_for_version(version) : current_running_container_id, "xargs docker logs#{" --since #{since}" if since}#{" --tail #{lines}" if lines} 2>&1", ("grep '#{grep}'" if grep) end diff --git a/lib/kamal/commands/base.rb b/lib/kamal/commands/base.rb index a98d6330..2173d064 100644 --- a/lib/kamal/commands/base.rb +++ b/lib/kamal/commands/base.rb @@ -3,7 +3,6 @@ module Kamal::Commands delegate :sensitive, :argumentize, to: Kamal::Utils DOCKER_HEALTH_STATUS_FORMAT = "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'" - DOCKER_HEALTH_LOG_FORMAT = "'{{json .State.Health}}'" attr_accessor :config diff --git a/lib/kamal/commands/healthcheck.rb b/lib/kamal/commands/healthcheck.rb deleted file mode 100644 index 2517a5e2..00000000 --- a/lib/kamal/commands/healthcheck.rb +++ /dev/null @@ -1,59 +0,0 @@ -class Kamal::Commands::Healthcheck < Kamal::Commands::Base - def run - primary = config.role(config.primary_role) - - docker :run, - "--detach", - "--name", container_name_with_version, - "--publish", "#{exposed_port}:#{config.healthcheck["port"]}", - "--label", "service=#{config.healthcheck_service}", - "-e", "KAMAL_CONTAINER_NAME=\"#{config.healthcheck_service}\"", - *primary.env_args(config.primary_host), - *primary.health_check_args(cord: false), - *config.volume_args, - *primary.option_args, - config.absolute_image, - primary.cmd - end - - def status - pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT)) - end - - def container_health_log - pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_LOG_FORMAT)) - end - - def logs - pipe container_id, xargs(docker(:logs, "--tail", log_lines, "2>&1")) - end - - def stop - pipe container_id, xargs(docker(:stop)) - end - - def remove - pipe container_id, xargs(docker(:container, :rm)) - end - - private - def container_name_with_version - "#{config.healthcheck_service}-#{config.version}" - end - - def container_id - container_id_for(container_name: container_name_with_version) - end - - def health_url - "http://localhost:#{exposed_port}#{config.healthcheck["path"]}" - end - - def exposed_port - config.healthcheck["exposed_port"] - end - - def log_lines - config.healthcheck["log_lines"] - end -end diff --git a/lib/kamal/configuration.rb b/lib/kamal/configuration.rb index 4957be17..bed91cf6 100644 --- a/lib/kamal/configuration.rb +++ b/lib/kamal/configuration.rb @@ -188,7 +188,7 @@ class Kamal::Configuration def healthcheck - { "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {}) + { "path" => "/up", "port" => 3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {}) end def healthcheck_service diff --git a/lib/kamal/sshkit_with_ext.rb b/lib/kamal/sshkit_with_ext.rb index e0c62c3a..2d0257a8 100644 --- a/lib/kamal/sshkit_with_ext.rb +++ b/lib/kamal/sshkit_with_ext.rb @@ -103,3 +103,39 @@ class SSHKit::Backend::Netssh prepend LimitConcurrentStartsInstance end + +class SSHKit::Runner::Parallel + # SSHKit joins the threads in sequence and fails on the first error it encounters, which means that we wait threads + # before the first failure to complete but not for ones after. + # + # We'll patch it to wait for them all to complete, and to record all the threads that errored so we can see when a + # problem occurs on multiple hosts. + module CompleteAll + def execute + threads = hosts.map do |host| + Thread.new(host) do |h| + backend(h, &block).run + rescue ::StandardError => e + e2 = SSHKit::Runner::ExecuteError.new e + raise e2, "Exception while executing #{host.user ? "as #{host.user}@" : "on host "}#{host}: #{e.message}" + end + end + + exceptions = [] + threads.each do |t| + begin + t.join + rescue SSHKit::Runner::ExecuteError => e + exceptions << e + end + end + if exceptions.one? + raise exceptions.first + elsif exceptions.many? + raise exceptions.first, [ "Exceptions on #{exceptions.count} hosts:", exceptions.map(&:message) ].join("\n") + end + end + end + + prepend CompleteAll +end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index c75a86cc..7a2a266e 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -118,6 +118,56 @@ class CliAppTest < CliTestCase end end + test "boot with web barrier opened" do + Object.any_instance.stubs(:sleep) + + SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("running").at_least_once # web health check passing + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("unhealthy").at_least_once # web health check failing + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("running").at_least_once # workers health check + + run_command("boot", config: :with_roles, host: nil).tap do |output| + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.3...", output + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.4...", output + assert_match "First web container is healthy, booting workers on 1.1.1.3", output + assert_match "First web container is healthy, booting workers on 1.1.1.4", output + end + end + + test "boot with web barrier closed" do + Thread.report_on_exception = false + + Object.any_instance.stubs(:sleep) + + SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version + + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") + .returns("unhealthy").at_least_once # web health check failing + + stderred do + run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output| + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.3...", output + assert_match "Waiting for the first healthy web container before booting workers on 1.1.1.4...", output + assert_match "First web container is unhealthy, not booting workers on 1.1.1.3", output + assert_match "First web container is unhealthy, not booting workers on 1.1.1.4", output + assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.1", output + assert_match "Running docker container ls --all --filter name=^app-web-latest$ --quiet | xargs docker stop on 1.1.1.2", output + end + end + ensure + Thread.report_on_exception = true + end + test "start" do run_command("start").tap do |output| assert_match "docker start app-web-999", output @@ -283,8 +333,12 @@ class CliAppTest < CliTestCase end private - def run_command(*command, config: :with_accessories) - stdouted { Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", "--hosts", "1.1.1.1" ]) } + def run_command(*command, config: :with_accessories, host: "1.1.1.1", allow_execute_error: false) + stdouted do + Kamal::Cli::App.start([ *command, "-c", "test/fixtures/deploy_#{config}.yml", *([ "--hosts", host ] if host) ]) + rescue SSHKit::Runner::ExecuteError => e + raise e unless allow_execute_error + end end def stub_running diff --git a/test/cli/healthcheck_test.rb b/test/cli/healthcheck_test.rb deleted file mode 100644 index e273bf2e..00000000 --- a/test/cli/healthcheck_test.rb +++ /dev/null @@ -1,82 +0,0 @@ -require_relative "cli_test_case" - -class CliHealthcheckTest < CliTestCase - test "perform" do - # Prevent expected failures from outputting to terminal - Thread.report_on_exception = false - - Kamal::Cli::Healthcheck::Poller.stubs(:sleep) # No sleeping when retrying - Kamal::Configuration.any_instance.stubs(:run_id).returns("12345678901234567890123456789012") - - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false) - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "KAMAL_CONTAINER_NAME=\"healthcheck-app\"", "--env-file", ".kamal/env/roles/app-web.env", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999") - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false) - - # Fail twice to test retry logic - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") - .returns("starting") - .then - .returns("unhealthy") - .then - .returns("healthy") - - run_command("perform").tap do |output| - assert_match "container not ready (starting), retrying in 1s (attempt 1/7)...", output - assert_match "container not ready (unhealthy), retrying in 2s (attempt 2/7)...", output - assert_match "Container is healthy!", output - end - end - - test "perform failing to become healthy" do - # Prevent expected failures from outputting to terminal - Thread.report_on_exception = false - - Kamal::Cli::Healthcheck::Poller.stubs(:sleep) # No sleeping when retrying - - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false) - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "KAMAL_CONTAINER_NAME=\"healthcheck-app\"", "--env-file", ".kamal/env/roles/app-web.env", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999") - SSHKit::Backend::Abstract.any_instance.stubs(:execute) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false) - - # Continually report unhealthy - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'") - .returns("unhealthy") - - # Capture logs when failing - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :logs, "--tail", 50, "2>&1") - .returns("some log output") - - # Capture container health log when failing - SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_pretty_json) - .with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{json .State.Health}}'") - .returns('{"Status":"unhealthy","Log":[{"ExitCode": 1,"Output": "/bin/sh: 1: curl: not found\n"}]}"') - - exception = assert_raises do - run_command("perform") - end - assert_match "container not ready (unhealthy)", exception.message - end - - test "raises an exception if primary does not have traefik" do - SSHKit::Backend::Abstract.any_instance.expects(:execute).never - - exception = assert_raises do - run_command("perform", config_file: "test/fixtures/deploy_workers_only.yml") - end - - assert_equal "The primary host is not configured to run Traefik", exception.message - end - - private - def run_command(*command, config_file: "test/fixtures/deploy_with_accessories.yml") - stdouted { Kamal::Cli::Healthcheck.start([ *command, "-c", config_file ]) } - end -end diff --git a/test/cli/main_test.rb b/test/cli/main_test.rb index 81d3eb4d..358a7394 100644 --- a/test/cli/main_test.rb +++ b/test/cli/main_test.rb @@ -25,7 +25,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -38,7 +37,6 @@ class CliMainTest < CliTestCase assert_match /Log into image registry/, output assert_match /Pull app image/, output assert_match /Ensure Traefik is running/, output - assert_match /Ensure app can pass healthcheck/, output assert_match /Detect stale containers/, output assert_match /Prune old containers and images/, output assert_match /Releasing the deploy lock/, output @@ -51,7 +49,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -65,7 +62,6 @@ class CliMainTest < CliTestCase assert_match /Build and push app image/, output assert_hook_ran "pre-deploy", output, **hook_variables assert_match /Ensure Traefik is running/, output - assert_match /Ensure app can pass healthcheck/, output assert_match /Detect stale containers/, output assert_match /Prune old containers and images/, output assert_hook_ran "post-deploy", output, **hook_variables, runtime: true @@ -78,7 +74,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -88,7 +83,6 @@ class CliMainTest < CliTestCase assert_match /Log into image registry/, output assert_match /Pull app image/, output assert_match /Ensure Traefik is running/, output - assert_match /Ensure app can pass healthcheck/, output assert_match /Detect stale containers/, output assert_match /Prune old containers and images/, output assert_match /Releasing the deploy lock/, output @@ -154,7 +148,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -185,7 +178,6 @@ class CliMainTest < CliTestCase Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:registry:login", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:traefik:boot", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:prune:all", [], invoke_options) @@ -197,7 +189,6 @@ class CliMainTest < CliTestCase invoke_options = { "config_file" => "test/fixtures/deploy_simple.yml", "version" => "999", "skip_hooks" => false, "verbose" => true } Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:deliver", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) @@ -210,7 +201,6 @@ class CliMainTest < CliTestCase assert_match /Build and push app image/, output assert_hook_ran "pre-deploy", output, **hook_variables assert_match /Running the pre-deploy hook.../, output - assert_match /Ensure app can pass healthcheck/, output assert_hook_ran "post-deploy", output, **hook_variables, runtime: true end end @@ -219,13 +209,11 @@ class CliMainTest < CliTestCase invoke_options = { "config_file" => "test/fixtures/deploy_simple.yml", "version" => "999", "skip_hooks" => false } Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:build:pull", [], invoke_options) - Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:healthcheck:perform", [], invoke_options) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)) Kamal::Cli::Main.any_instance.expects(:invoke).with("kamal:cli:app:boot", [], invoke_options) run_command("redeploy", "--skip_push").tap do |output| assert_match /Pull app image/, output - assert_match /Ensure app can pass healthcheck/, output end end diff --git a/test/commander_test.rb b/test/commander_test.rb index 2bbdb479..6a7ec536 100644 --- a/test/commander_test.rb +++ b/test/commander_test.rb @@ -99,6 +99,11 @@ class CommanderTest < ActiveSupport::TestCase assert_equal [ "workers" ], @kamal.roles_on("1.1.1.3").map(&:name) end + test "roles_on web comes first" do + configure_with(:deploy_with_two_roles_one_host) + assert_equal [ "web", "workers" ], @kamal.roles_on("1.1.1.1").map(&:name) + end + test "default group strategy" do assert_empty @kamal.boot_strategy end diff --git a/test/commands/healthcheck_test.rb b/test/commands/healthcheck_test.rb deleted file mode 100644 index 829aa0b2..00000000 --- a/test/commands/healthcheck_test.rb +++ /dev/null @@ -1,124 +0,0 @@ -require "test_helper" - -class CommandsHealthcheckTest < ActiveSupport::TestCase - setup do - @config = { - service: "app", image: "dhh/app", registry: { "username" => "dhh", "password" => "secret" }, servers: [ "1.1.1.1" ], - traefik: { "args" => { "accesslog.format" => "json", "metrics.prometheus.buckets" => "0.1,0.3,1.2,5.0" } } - } - end - - test "run" do - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with custom port" do - @config[:healthcheck] = { "port" => 3001 } - - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3001 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"curl -f http://localhost:3001/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with destination" do - @destination = "staging" - - assert_equal \ - "docker run --detach --name healthcheck-app-staging-123 --publish 3999:3000 --label service=healthcheck-app-staging -e KAMAL_CONTAINER_NAME=\"healthcheck-app-staging\" --env-file .kamal/env/roles/app-web-staging.env --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with custom healthcheck" do - @config[:healthcheck] = { "cmd" => "/bin/up" } - - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"/bin/up\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with custom options" do - @config[:servers] = { "web" => { "hosts" => [ "1.1.1.1" ], "options" => { "mount" => "somewhere" } } } - @config[:healthcheck] = { "exposed_port" => 4999 } - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 4999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --mount \"somewhere\" dhh/app:123", - new_command.run.join(" ") - end - - test "run with tags" do - @config[:servers] = [ { "1.1.1.1" => "tag1" } ] - @config[:env] = {} - @config[:env]["tags"] = { "tag1" => { "ENV1" => "value1" } } - - assert_equal \ - "docker run --detach --name healthcheck-app-123 --publish 3999:3000 --label service=healthcheck-app -e KAMAL_CONTAINER_NAME=\"healthcheck-app\" --env-file .kamal/env/roles/app-web.env --env ENV1=\"value1\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" dhh/app:123", - new_command.run.join(" ") - end - - test "status" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'", - new_command.status.join(" ") - end - - test "container_health_log" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker inspect --format '{{json .State.Health}}'", - new_command.container_health_log.join(" ") - end - - test "stop" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker stop", - new_command.stop.join(" ") - end - - test "stop with destination" do - @destination = "staging" - - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-staging-123$ --quiet | xargs docker stop", - new_command.stop.join(" ") - end - - test "remove" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker container rm", - new_command.remove.join(" ") - end - - test "remove with destination" do - @destination = "staging" - - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-staging-123$ --quiet | xargs docker container rm", - new_command.remove.join(" ") - end - - test "logs" do - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker logs --tail 50 2>&1", - new_command.logs.join(" ") - end - - test "logs with custom lines number" do - @config[:healthcheck] = { "log_lines" => 150 } - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-123$ --quiet | xargs docker logs --tail 150 2>&1", - new_command.logs.join(" ") - end - - test "logs with destination" do - @destination = "staging" - - assert_equal \ - "docker container ls --all --filter name=^healthcheck-app-staging-123$ --quiet | xargs docker logs --tail 50 2>&1", - new_command.logs.join(" ") - end - - private - def new_command - Kamal::Commands::Healthcheck.new(Kamal::Configuration.new(@config, destination: @destination, version: "123")) - end -end diff --git a/test/configuration_test.rb b/test/configuration_test.rb index 6a86b231..e7e7f151 100644 --- a/test/configuration_test.rb +++ b/test/configuration_test.rb @@ -272,7 +272,7 @@ class ConfigurationTest < ActiveSupport::TestCase volume_args: [ "--volume", "/local/path:/container/path" ], builder: {}, logging: [ "--log-opt", "max-size=\"10m\"" ], - healthcheck: { "path"=>"/up", "port"=>3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 } } + healthcheck: { "path"=>"/up", "port"=>3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 } } assert_equal expected_config, @config.to_h end diff --git a/test/fixtures/deploy_with_two_roles_one_host.yml b/test/fixtures/deploy_with_two_roles_one_host.yml new file mode 100644 index 00000000..cae05469 --- /dev/null +++ b/test/fixtures/deploy_with_two_roles_one_host.yml @@ -0,0 +1,15 @@ +service: app +image: dhh/app +servers: + workers: + hosts: + - 1.1.1.1 + web: + hosts: + - 1.1.1.1 +env: + REDIS_URL: redis://x/y +registry: + server: registry.digitalocean.com + username: user + password: pw diff --git a/test/integration/broken_deploy_test.rb b/test/integration/broken_deploy_test.rb new file mode 100644 index 00000000..a3f74d45 --- /dev/null +++ b/test/integration/broken_deploy_test.rb @@ -0,0 +1,34 @@ +require_relative "integration_test" + +class BrokenDeployTest < IntegrationTest + test "deploying a bad image" do + @app = "app_with_roles" + + kamal :envify + + first_version = latest_app_version + + kamal :deploy + + assert_app_is_up version: first_version + assert_container_running host: :vm3, name: "app-workers-#{first_version}" + + second_version = break_app + + output = kamal :deploy, raise_on_error: false, capture: true + + assert_failed_deploy output + assert_app_is_up version: first_version + assert_container_running host: :vm3, name: "app-workers-#{first_version}" + assert_container_not_running host: :vm3, name: "app-workers-#{second_version}" + end + + private + def assert_failed_deploy(output) + assert_match "Waiting for the first healthy web container before booting workers on vm3...", output + assert_match /First web container is unhealthy on vm[12], not booting other roles/, output + assert_match "First web container is unhealthy, not booting workers on vm3", output + assert_match "nginx: [emerg] unexpected end of file, expecting \";\" or \"}\" in /etc/nginx/conf.d/default.conf:2", output + assert_match 'ERROR {"Status":"unhealthy","FailingStreak":0,"Log":[]}', output + end +end diff --git a/test/integration/docker/deployer/app/config/deploy.yml b/test/integration/docker/deployer/app/config/deploy.yml index 38113737..397a49ca 100644 --- a/test/integration/docker/deployer/app/config/deploy.yml +++ b/test/integration/docker/deployer/app/config/deploy.yml @@ -28,6 +28,7 @@ builder: COMMIT_SHA: <%= `git rev-parse HEAD` %> healthcheck: cmd: wget -qO- http://localhost > /dev/null || exit 1 + max_attempts: 3 traefik: args: accesslog: true @@ -41,3 +42,4 @@ accessories: roles: - web stop_wait_time: 1 +readiness_delay: 0 diff --git a/test/integration/docker/deployer/app_with_roles/config/deploy.yml b/test/integration/docker/deployer/app_with_roles/config/deploy.yml index 004ffb25..2cf362c6 100644 --- a/test/integration/docker/deployer/app_with_roles/config/deploy.yml +++ b/test/integration/docker/deployer/app_with_roles/config/deploy.yml @@ -22,6 +22,7 @@ builder: COMMIT_SHA: <%= `git rev-parse HEAD` %> healthcheck: cmd: wget -qO- http://localhost > /dev/null || exit 1 + max_attempts: 3 traefik: args: accesslog: true @@ -35,3 +36,4 @@ accessories: roles: - web stop_wait_time: 1 +readiness_delay: 0 diff --git a/test/integration/docker/deployer/break_app.sh b/test/integration/docker/deployer/break_app.sh new file mode 100755 index 00000000..e8b19044 --- /dev/null +++ b/test/integration/docker/deployer/break_app.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +cd $1 && echo "bad nginx config" > default.conf && git commit -am 'Broken' diff --git a/test/integration/integration_test.rb b/test/integration/integration_test.rb index bd62c474..f8f54d42 100644 --- a/test/integration/integration_test.rb +++ b/test/integration/integration_test.rb @@ -78,6 +78,11 @@ class IntegrationTest < ActiveSupport::TestCase latest_app_version end + def break_app + deployer_exec "./break_app.sh #{@app}", workdir: "/" + latest_app_version + end + def latest_app_version deployer_exec("git rev-parse HEAD", capture: true) end @@ -131,4 +136,16 @@ class IntegrationTest < ActiveSupport::TestCase puts "Tried to get the response code again and got #{app_response.code}" end end + + def assert_container_running(host:, name:) + assert container_running?(host: host, name: name) + end + + def assert_container_not_running(host:, name:) + assert_not container_running?(host: host, name: name) + end + + def container_running?(host:, name:) + docker_compose("exec #{host} docker ps --filter=name=#{name} | tail -n+2", capture: true).tap { |x| p [ x, x.strip, x.strip.present? ] }.strip.present? + end end diff --git a/test/integration/main_test.rb b/test/integration/main_test.rb index a2dc8a0c..62857d4a 100644 --- a/test/integration/main_test.rb +++ b/test/integration/main_test.rb @@ -56,6 +56,12 @@ class MainTest < IntegrationTest assert_app_is_up version: version assert_hooks_ran "pre-connect", "pre-build", "pre-deploy", "post-deploy" assert_container_running host: :vm3, name: "app-workers-#{version}" + + second_version = update_app_rev + + kamal :redeploy + assert_app_is_up version: second_version + assert_container_running host: :vm3, name: "app-workers-#{second_version}" end test "config" do @@ -73,7 +79,7 @@ class MainTest < IntegrationTest assert_equal({ user: "root", port: 22, keepalive: true, keepalive_interval: 30, log_level: :fatal }, config[:ssh_options]) assert_equal({ "multiarch" => false, "args" => { "COMMIT_SHA" => version } }, config[:builder]) assert_equal [ "--log-opt", "max-size=\"10m\"" ], config[:logging] - assert_equal({ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord"=>"/tmp/kamal-cord", "log_lines" => 50, "cmd"=>"wget -qO- http://localhost > /dev/null || exit 1" }, config[:healthcheck]) + assert_equal({ "path" => "/up", "port" => 3000, "max_attempts" => 3, "cord"=>"/tmp/kamal-cord", "log_lines" => 50, "cmd"=>"wget -qO- http://localhost > /dev/null || exit 1" }, config[:healthcheck]) end test "setup and remove" do @@ -157,8 +163,4 @@ class MainTest < IntegrationTest assert vm1_image_ids.any? assert vm1_container_ids.any? end - - def assert_container_running(host:, name:) - assert docker_compose("exec #{host} docker ps --filter=name=#{name} -q", capture: true).strip.present? - end end