From f530009a6ee52c58ab165a4da14a39d93591b495 Mon Sep 17 00:00:00 2001 From: Kevin McConnell Date: Thu, 13 Apr 2023 12:43:19 +0100 Subject: [PATCH 01/17] Allow performing boot & start operations in groups Adds top-level configuration options for `group_limit` and `group_wait`. When a `group_limit` is present, we'll perform app boot & start operations on no more than `group_limit` hosts at a time, optionally sleeping for `group_wait` seconds after each batch. We currently only do this batching on boot & start operations (including when they are part of a deployment). Other commands, like `app stop` or `app details` still work on all hosts in parallel. --- README.md | 17 +++++++++++++++++ lib/mrsk/cli/app.rb | 4 ++-- lib/mrsk/configuration.rb | 17 +++++++++++++++++ test/cli/app_test.rb | 14 ++++++++++++-- test/fixtures/deploy_with_group_strategy.yml | 16 ++++++++++++++++ 5 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 test/fixtures/deploy_with_group_strategy.yml diff --git a/README.md b/README.md index 23fc8ad6..d972b298 100644 --- a/README.md +++ b/README.md @@ -831,6 +831,23 @@ mrsk lock acquire -m "Doing maintanence" mrsk lock release ``` +## Gradual restarts + +When deploying to large numbers of hosts, you might prefer not to restart your services on every host at the same time. + +MRSK's default is to start new containers on all hosts in parallel. But you can control this by configuring `group_limit` and `group_wait`. + +```yaml +service: myservice + +group_limit: 10 +group_wait: 30 +``` + +When `group_limit` is specified, containers will be started on, at most, `group_limit` hosts at once. MRSK will pause for `group_wait` seconds between batches. + +These settings only apply when starting containers (using `mrsk deploy`, `mrsk app boot` or `mrsk app start`). For other commands, MRSK continues to run commands in parallel across all hosts. + ## Stage of development This is beta software. Commands may still move around. 
But we're live in production at [37signals](https://37signals.com). diff --git a/lib/mrsk/cli/app.rb b/lib/mrsk/cli/app.rb index 882c80c8..b6c9e1b8 100644 --- a/lib/mrsk/cli/app.rb +++ b/lib/mrsk/cli/app.rb @@ -13,7 +13,7 @@ class Mrsk::Cli::App < Mrsk::Cli::Base execute *MRSK.app.tag_current_as_latest end - on(MRSK.hosts) do |host| + on(MRSK.hosts, **MRSK.config.group_strategy) do |host| roles = MRSK.roles_on(host) roles.each do |role| @@ -39,7 +39,7 @@ class Mrsk::Cli::App < Mrsk::Cli::Base desc "start", "Start existing app container on servers" def start with_lock do - on(MRSK.hosts) do |host| + on(MRSK.hosts, **MRSK.config.group_strategy) do |host| roles = MRSK.roles_on(host) roles.each do |role| diff --git a/lib/mrsk/configuration.rb b/lib/mrsk/configuration.rb index a2c2ecf1..388bcce4 100644 --- a/lib/mrsk/configuration.rb +++ b/lib/mrsk/configuration.rb @@ -153,6 +153,15 @@ class Mrsk::Configuration end + def group_strategy + if group_limit.present? + { in: :groups, limit: group_limit, wait: group_wait } + else + {} + end + end + + def audit_broadcast_cmd raw_config.audit_broadcast_cmd end @@ -237,4 +246,12 @@ class Mrsk::Configuration raise "Can't use commit hash as version, no git repository found in #{Dir.pwd}" end end + + def group_limit + raw_config.group_limit&.to_i + end + + def group_wait + raw_config.group_wait&.to_i + end end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index 78b166d4..f27c04ea 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -33,6 +33,16 @@ class CliAppTest < CliTestCase Thread.report_on_exception = true end + test "boot uses group strategy when specified" do + Mrsk::Cli::App.any_instance.stubs(:on).with("1.1.1.1").twice # acquire & release lock + Mrsk::Cli::App.any_instance.stubs(:on).with([ "1.1.1.1" ]) # tag container + + # Strategy is used when booting the containers + Mrsk::Cli::App.any_instance.expects(:on).with([ "1.1.1.1" ], in: :groups, limit: 3, wait: 30).with_block_given + + 
run_command("boot", config: :with_group_strategy) + end + test "start" do run_command("start").tap do |output| assert_match "docker start app-web-999", output @@ -151,7 +161,7 @@ class CliAppTest < CliTestCase end private - def run_command(*command) - stdouted { Mrsk::Cli::App.start([*command, "-c", "test/fixtures/deploy_with_accessories.yml", "--hosts", "1.1.1.1"]) } + def run_command(*command, config: :with_accessories) + stdouted { Mrsk::Cli::App.start([*command, "-c", "test/fixtures/deploy_#{config}.yml", "--hosts", "1.1.1.1"]) } end end diff --git a/test/fixtures/deploy_with_group_strategy.yml b/test/fixtures/deploy_with_group_strategy.yml new file mode 100644 index 00000000..bc551f8a --- /dev/null +++ b/test/fixtures/deploy_with_group_strategy.yml @@ -0,0 +1,16 @@ +service: app +image: dhh/app +servers: + web: + - "1.1.1.1" + - "1.1.1.2" + workers: + - "1.1.1.3" + - "1.1.1.4" + +registry: + username: user + password: pw + +group_limit: 3 +group_wait: 30 From 100b72e4b48d4a887746ddf0f8981b1de1096dcf Mon Sep 17 00:00:00 2001 From: Kevin McConnell Date: Fri, 14 Apr 2023 10:41:07 +0100 Subject: [PATCH 02/17] Limit rolling deployment to boot operation --- README.md | 8 ++++---- lib/mrsk/cli/app.rb | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d972b298..e3671f5c 100644 --- a/README.md +++ b/README.md @@ -831,11 +831,11 @@ mrsk lock acquire -m "Doing maintanence" mrsk lock release ``` -## Gradual restarts +## Rolling deployments When deploying to large numbers of hosts, you might prefer not to restart your services on every host at the same time. -MRSK's default is to start new containers on all hosts in parallel. But you can control this by configuring `group_limit` and `group_wait`. +MRSK's default is to boot new containers on all hosts in parallel. But you can control this by configuring `group_limit` and `group_wait`. 
```yaml service: myservice @@ -844,9 +844,9 @@ group_limit: 10 group_wait: 30 ``` -When `group_limit` is specified, containers will be started on, at most, `group_limit` hosts at once. MRSK will pause for `group_wait` seconds between batches. +When `group_limit` is specified, containers will be booted on, at most, `group_limit` hosts at once. MRSK will pause for `group_wait` seconds between batches. -These settings only apply when starting containers (using `mrsk deploy`, `mrsk app boot` or `mrsk app start`). For other commands, MRSK continues to run commands in parallel across all hosts. +These settings only apply when booting containers (using `mrsk deploy`, or `mrsk app boot`). For other commands, MRSK continues to run commands in parallel across all hosts. ## Stage of development diff --git a/lib/mrsk/cli/app.rb b/lib/mrsk/cli/app.rb index b6c9e1b8..735fef88 100644 --- a/lib/mrsk/cli/app.rb +++ b/lib/mrsk/cli/app.rb @@ -39,7 +39,7 @@ class Mrsk::Cli::App < Mrsk::Cli::Base desc "start", "Start existing app container on servers" def start with_lock do - on(MRSK.hosts, **MRSK.config.group_strategy) do |host| + on(MRSK.hosts) do |host| roles = MRSK.roles_on(host) roles.each do |role| From a8726be20eda76c68f814c9abfa3854b694b7ecb Mon Sep 17 00:00:00 2001 From: Kevin McConnell Date: Fri, 14 Apr 2023 11:01:25 +0100 Subject: [PATCH 03/17] Move `group_limit` & `group_wait` under `boot` Also make formatting the group strategy the responsibility of the commander. 
--- README.md | 7 ++++--- lib/mrsk/cli/app.rb | 2 +- lib/mrsk/commander.rb | 8 ++++++++ lib/mrsk/configuration.rb | 21 ++++---------------- lib/mrsk/configuration/boot.rb | 9 +++++++++ test/fixtures/deploy_with_group_strategy.yml | 5 +++-- 6 files changed, 29 insertions(+), 23 deletions(-) create mode 100644 lib/mrsk/configuration/boot.rb diff --git a/README.md b/README.md index e3671f5c..9fcea1ed 100644 --- a/README.md +++ b/README.md @@ -835,13 +835,14 @@ mrsk lock release When deploying to large numbers of hosts, you might prefer not to restart your services on every host at the same time. -MRSK's default is to boot new containers on all hosts in parallel. But you can control this by configuring `group_limit` and `group_wait`. +MRSK's default is to boot new containers on all hosts in parallel. But you can control this by configuring `group_limit` and `group_wait` as boot options: ```yaml service: myservice -group_limit: 10 -group_wait: 30 +boot: + group_limit: 10 + group_wait: 2 ``` When `group_limit` is specified, containers will be booted on, at most, `group_limit` hosts at once. MRSK will pause for `group_wait` seconds between batches. diff --git a/lib/mrsk/cli/app.rb b/lib/mrsk/cli/app.rb index 735fef88..7f6fbf44 100644 --- a/lib/mrsk/cli/app.rb +++ b/lib/mrsk/cli/app.rb @@ -13,7 +13,7 @@ class Mrsk::Cli::App < Mrsk::Cli::Base execute *MRSK.app.tag_current_as_latest end - on(MRSK.hosts, **MRSK.config.group_strategy) do |host| + on(MRSK.hosts, **MRSK.group_strategy) do |host| roles = MRSK.roles_on(host) roles.each do |role| diff --git a/lib/mrsk/commander.rb b/lib/mrsk/commander.rb index 44c96a2a..160b619e 100644 --- a/lib/mrsk/commander.rb +++ b/lib/mrsk/commander.rb @@ -51,6 +51,14 @@ class Mrsk::Commander end end + def group_strategy + if config.boot.group_limit.present? 
+ { in: :groups, limit: config.boot.group_limit, wait: config.boot.group_wait } + else + {} + end + end + def roles_on(host) roles.select { |role| role.hosts.include?(host.to_s) }.map(&:name) end diff --git a/lib/mrsk/configuration.rb b/lib/mrsk/configuration.rb index 388bcce4..c3415703 100644 --- a/lib/mrsk/configuration.rb +++ b/lib/mrsk/configuration.rb @@ -87,6 +87,10 @@ class Mrsk::Configuration roles.select(&:running_traefik?).flat_map(&:hosts).uniq end + def boot + Mrsk::Configuration::Boot.new(section: raw_config.boot) + end + def repository [ raw_config.registry["server"], image ].compact.join("/") @@ -153,15 +157,6 @@ class Mrsk::Configuration end - def group_strategy - if group_limit.present? - { in: :groups, limit: group_limit, wait: group_wait } - else - {} - end - end - - def audit_broadcast_cmd raw_config.audit_broadcast_cmd end @@ -246,12 +241,4 @@ class Mrsk::Configuration raise "Can't use commit hash as version, no git repository found in #{Dir.pwd}" end end - - def group_limit - raw_config.group_limit&.to_i - end - - def group_wait - raw_config.group_wait&.to_i - end end diff --git a/lib/mrsk/configuration/boot.rb b/lib/mrsk/configuration/boot.rb new file mode 100644 index 00000000..150f3d5f --- /dev/null +++ b/lib/mrsk/configuration/boot.rb @@ -0,0 +1,9 @@ +class Mrsk::Configuration::Boot + attr_reader :group_wait, :group_limit + + def initialize(section:) + section = section || {} + @group_limit = section["group_limit"] + @group_wait = section["group_wait"] + end +end diff --git a/test/fixtures/deploy_with_group_strategy.yml b/test/fixtures/deploy_with_group_strategy.yml index bc551f8a..a082f539 100644 --- a/test/fixtures/deploy_with_group_strategy.yml +++ b/test/fixtures/deploy_with_group_strategy.yml @@ -12,5 +12,6 @@ registry: username: user password: pw -group_limit: 3 -group_wait: 30 +boot: + group_limit: 3 + group_wait: 30 From f055766918f9fe9fa4b91192c937f26dc61bcd5a Mon Sep 17 00:00:00 2001 From: Kevin McConnell Date: Fri, 14 Apr 2023 
11:26:10 +0100 Subject: [PATCH 04/17] Allow percentage-based rolling deployments --- README.md | 2 +- lib/mrsk/configuration.rb | 2 +- lib/mrsk/configuration/boot.rb | 20 ++++++++++---- test/cli/app_test.rb | 2 +- test/commander_test.rb | 27 ++++++++++++++++--- test/fixtures/deploy_with_group_strategy.yml | 2 +- .../deploy_with_precentage_group_strategy.yml | 17 ++++++++++++ 7 files changed, 60 insertions(+), 12 deletions(-) create mode 100644 test/fixtures/deploy_with_precentage_group_strategy.yml diff --git a/README.md b/README.md index 9fcea1ed..093d43bf 100644 --- a/README.md +++ b/README.md @@ -841,7 +841,7 @@ MRSK's default is to boot new containers on all hosts in parallel. But you can c service: myservice boot: - group_limit: 10 + group_limit: 10 # Can also specify as a percentage of total hosts, such as "25%" group_wait: 2 ``` diff --git a/lib/mrsk/configuration.rb b/lib/mrsk/configuration.rb index c3415703..4d2d010a 100644 --- a/lib/mrsk/configuration.rb +++ b/lib/mrsk/configuration.rb @@ -88,7 +88,7 @@ class Mrsk::Configuration end def boot - Mrsk::Configuration::Boot.new(section: raw_config.boot) + Mrsk::Configuration::Boot.new(config: self) end diff --git a/lib/mrsk/configuration/boot.rb b/lib/mrsk/configuration/boot.rb index 150f3d5f..dd86b689 100644 --- a/lib/mrsk/configuration/boot.rb +++ b/lib/mrsk/configuration/boot.rb @@ -1,9 +1,19 @@ class Mrsk::Configuration::Boot - attr_reader :group_wait, :group_limit + def initialize(config:) + @options = config.raw_config.boot || {} + @host_count = config.all_hosts.count + end - def initialize(section:) - section = section || {} - @group_limit = section["group_limit"] - @group_wait = section["group_wait"] + def group_limit + limit = @options["group_limit"] + if limit.to_s.end_with?("%") + @host_count * limit.to_i / 100 + else + limit + end + end + + def group_wait + @options["group_wait"] end end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index f27c04ea..dc2594b9 100644 --- 
a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -38,7 +38,7 @@ class CliAppTest < CliTestCase Mrsk::Cli::App.any_instance.stubs(:on).with([ "1.1.1.1" ]) # tag container # Strategy is used when booting the containers - Mrsk::Cli::App.any_instance.expects(:on).with([ "1.1.1.1" ], in: :groups, limit: 3, wait: 30).with_block_given + Mrsk::Cli::App.any_instance.expects(:on).with([ "1.1.1.1" ], in: :groups, limit: 3, wait: 2).with_block_given run_command("boot", config: :with_group_strategy) end diff --git a/test/commander_test.rb b/test/commander_test.rb index 163feaf7..e5fd2961 100644 --- a/test/commander_test.rb +++ b/test/commander_test.rb @@ -2,9 +2,7 @@ require "test_helper" class CommanderTest < ActiveSupport::TestCase setup do - @mrsk = Mrsk::Commander.new.tap do |mrsk| - mrsk.configure config_file: Pathname.new(File.expand_path("fixtures/deploy_with_roles.yml", __dir__)) - end + configure_with(:deploy_with_roles) end test "lazy configuration" do @@ -55,4 +53,27 @@ class CommanderTest < ActiveSupport::TestCase assert_equal [ "web" ], @mrsk.roles_on("1.1.1.1") assert_equal [ "workers" ], @mrsk.roles_on("1.1.1.3") end + + test "default group strategy" do + assert_empty @mrsk.group_strategy + end + + test "specific limit group strategy" do + configure_with(:deploy_with_group_strategy) + + assert_equal({ in: :groups, limit: 3, wait: 2 }, @mrsk.group_strategy) + end + + test "percentage-based group strategy" do + configure_with(:deploy_with_precentage_group_strategy) + + assert_equal({ in: :groups, limit: 1, wait: 2 }, @mrsk.group_strategy) + end + + private + def configure_with(variant) + @mrsk = Mrsk::Commander.new.tap do |mrsk| + mrsk.configure config_file: Pathname.new(File.expand_path("fixtures/#{variant}.yml", __dir__)) + end + end end diff --git a/test/fixtures/deploy_with_group_strategy.yml b/test/fixtures/deploy_with_group_strategy.yml index a082f539..91ae3cc0 100644 --- a/test/fixtures/deploy_with_group_strategy.yml +++ 
b/test/fixtures/deploy_with_group_strategy.yml @@ -14,4 +14,4 @@ registry: boot: group_limit: 3 - group_wait: 30 + group_wait: 2 diff --git a/test/fixtures/deploy_with_precentage_group_strategy.yml b/test/fixtures/deploy_with_precentage_group_strategy.yml new file mode 100644 index 00000000..e738d07d --- /dev/null +++ b/test/fixtures/deploy_with_precentage_group_strategy.yml @@ -0,0 +1,17 @@ +service: app +image: dhh/app +servers: + web: + - "1.1.1.1" + - "1.1.1.2" + workers: + - "1.1.1.3" + - "1.1.1.4" + +registry: + username: user + password: pw + +boot: + group_limit: 25% + group_wait: 2 From a77428143fbbdbd5b8000aa7411eb34728a7fc09 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Fri, 28 Apr 2023 14:19:15 +0100 Subject: [PATCH 05/17] Fix the integration test healthcheck The alpine nginx container doesn't contain curl, so let's override the healthcheck command to use wget. --- test/integration/docker/deployer/app/config/deploy.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/integration/docker/deployer/app/config/deploy.yml b/test/integration/docker/deployer/app/config/deploy.yml index 5ac25b14..bd97dc36 100644 --- a/test/integration/docker/deployer/app/config/deploy.yml +++ b/test/integration/docker/deployer/app/config/deploy.yml @@ -10,5 +10,4 @@ registry: builder: multiarch: false healthcheck: - path: / - port: 80 + cmd: wget -qO- http://localhost > /dev/null From 494a1ae089c66fe183ee5a1f4f38e5d725063e77 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 12:11:15 +0100 Subject: [PATCH 06/17] Report on container health after failure --- test/integration/deploy_test.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/integration/deploy_test.rb b/test/integration/deploy_test.rb index 89b2269f..0fb94d27 100644 --- a/test/integration/deploy_test.rb +++ b/test/integration/deploy_test.rb @@ -61,7 +61,10 @@ class DeployTest < ActiveSupport::TestCase def wait_for_healthy(timeout: 20) timeout_at = 
Time.now + timeout while docker_compose("ps -a | tail -n +2 | grep -v '(healthy)' | wc -l", capture: true) != "0" - raise "Container not healthy after #{timeout} seconds" if timeout_at < Time.now + if timeout_at < Time.now + docker_compose("ps -a | tail -n +2 | grep -v '(healthy)'") + raise "Container not healthy after #{timeout} seconds" if timeout_at < Time.now + end sleep 0.1 end end From ca2e2bac2ecaa73b97fafe455c9a2ed555b3c4b2 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 12:50:45 +0100 Subject: [PATCH 07/17] Fix missing for apt-get --- test/integration/docker/deployer/Dockerfile | 4 ++-- test/integration/docker/shared/Dockerfile | 2 +- test/integration/docker/vm/Dockerfile | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/integration/docker/deployer/Dockerfile b/test/integration/docker/deployer/Dockerfile index 22556cc2..ccd1de39 100644 --- a/test/integration/docker/deployer/Dockerfile +++ b/test/integration/docker/deployer/Dockerfile @@ -2,7 +2,7 @@ FROM ruby:3.2 WORKDIR /app -RUN apt-get update && apt-get install -y ca-certificates openssh-client curl gnupg docker.io +RUN apt-get update --fix-missing && apt-get install -y ca-certificates openssh-client curl gnupg docker.io RUN install -m 0755 -d /etc/apt/keyrings RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg @@ -12,7 +12,7 @@ RUN echo \ "$(. /etc/os-release && echo "$VERSION_CODENAME")" stable" | \ tee /etc/apt/sources.list.d/docker.list > /dev/null -RUN apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +RUN apt-get update --fix-missing && apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin COPY boot.sh . COPY app/ . 
diff --git a/test/integration/docker/shared/Dockerfile b/test/integration/docker/shared/Dockerfile index dae69053..bc0d8e84 100644 --- a/test/integration/docker/shared/Dockerfile +++ b/test/integration/docker/shared/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:22.10 WORKDIR /work -RUN apt-get update && apt-get -y install openssh-client openssl +RUN apt-get update --fix-missing && apt-get -y install openssh-client openssl RUN mkdir ssh && \ ssh-keygen -t rsa -f ssh/id_rsa -N "" diff --git a/test/integration/docker/vm/Dockerfile b/test/integration/docker/vm/Dockerfile index 99f881fa..f481023c 100644 --- a/test/integration/docker/vm/Dockerfile +++ b/test/integration/docker/vm/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:22.10 WORKDIR /work -RUN apt-get update && apt-get -y install openssh-client openssh-server docker.io +RUN apt-get update --fix-missing && apt-get -y install openssh-client openssh-server docker.io RUN mkdir /root/.ssh && ln -s /shared/ssh/id_rsa.pub /root/.ssh/authorized_keys RUN mkdir -p /etc/docker/certs.d/registry:4443 && ln -s /shared/certs/domain.crt /etc/docker/certs.d/registry:4443/ca.crt From 548a1019c14785f572710b94c3bc4fea7b9212cd Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 18:21:22 +0100 Subject: [PATCH 08/17] Dump traefik logs when app not booted --- test/integration/deploy_test.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/integration/deploy_test.rb b/test/integration/deploy_test.rb index 0fb94d27..1661b23a 100644 --- a/test/integration/deploy_test.rb +++ b/test/integration/deploy_test.rb @@ -51,7 +51,12 @@ class DeployTest < ActiveSupport::TestCase end def assert_app_is_up - assert_equal "200", app_response.code + code = app_response.code + if code != "200" + puts "Got response code #{code}, here are the traefik logs:" + mrsk :traefik, :logs + end + assert_equal "200", code end def app_response From 94f87eddedfd88f011e10128602de3ddaea3727b Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: 
Mon, 1 May 2023 18:27:08 +0100 Subject: [PATCH 09/17] Also dump load balancer logs --- test/integration/deploy_test.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/integration/deploy_test.rb b/test/integration/deploy_test.rb index 1661b23a..ec894caf 100644 --- a/test/integration/deploy_test.rb +++ b/test/integration/deploy_test.rb @@ -55,6 +55,8 @@ class DeployTest < ActiveSupport::TestCase if code != "200" puts "Got response code #{code}, here are the traefik logs:" mrsk :traefik, :logs + puts "Add here are the load balancer logs" + docker_compose :logs, :load_balancer end assert_equal "200", code end From 1170e2311eef53fe6433071f8d418381a16dee98 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 18:32:07 +0100 Subject: [PATCH 10/17] Check if we are still getting a 404 --- test/integration/deploy_test.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/integration/deploy_test.rb b/test/integration/deploy_test.rb index ec894caf..351dc5d2 100644 --- a/test/integration/deploy_test.rb +++ b/test/integration/deploy_test.rb @@ -55,8 +55,9 @@ class DeployTest < ActiveSupport::TestCase if code != "200" puts "Got response code #{code}, here are the traefik logs:" mrsk :traefik, :logs - puts "Add here are the load balancer logs" + puts "And here are the load balancer logs" docker_compose :logs, :load_balancer + puts "Tried to get the response code again and got #{app_response.code}" end assert_equal "200", code end From 650f9b1fbf7596213309b8ba6e4eb8c7fd5bb600 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 18:55:10 +0100 Subject: [PATCH 11/17] Include traefik access logs --- test/integration/docker/deployer/app/config/deploy.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/integration/docker/deployer/app/config/deploy.yml b/test/integration/docker/deployer/app/config/deploy.yml index bd97dc36..8442c57b 100644 --- a/test/integration/docker/deployer/app/config/deploy.yml +++ 
b/test/integration/docker/deployer/app/config/deploy.yml @@ -11,3 +11,7 @@ builder: multiarch: false healthcheck: cmd: wget -qO- http://localhost > /dev/null +traefik: + args: + accesslog: true + accesslog.format: json From d0f66db33c7de2004b601deb796f24cb97fe871e Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 18:58:46 +0100 Subject: [PATCH 12/17] Extend traefik delay by 1 second --- lib/mrsk/utils/healthcheck_poller.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mrsk/utils/healthcheck_poller.rb b/lib/mrsk/utils/healthcheck_poller.rb index d7b8be65..3ef5b7a8 100644 --- a/lib/mrsk/utils/healthcheck_poller.rb +++ b/lib/mrsk/utils/healthcheck_poller.rb @@ -1,5 +1,5 @@ class Mrsk::Utils::HealthcheckPoller - TRAEFIK_HEALTHY_DELAY = 1 + TRAEFIK_HEALTHY_DELAY = 2 class HealthcheckError < StandardError; end From a72f95f44df815a76ba00729fdf538c68d60087c Mon Sep 17 00:00:00 2001 From: Kevin McConnell Date: Mon, 1 May 2023 14:34:01 +0100 Subject: [PATCH 13/17] Ensure Traefik service name is consistent If we don't specify any service properties when labelling containers, the generated service will be named according to the container. However, we change the container name on every deployment (as it is versioned), which means that the auto-generated service name will be different in each container. That is a problem for two reasons: - Multiple containers share a common router while a deployment is happening. At this point, the router configuration will be different between the containers; Traefik flags this as an error, and stops routing to the containers until it's resolved. - We allow custom labels to be set in an app's config. In order to define custom configuration on the service, we'll need to know what it will be called. Changed to force the service name by setting one of its properties. 
--- lib/mrsk/configuration/role.rb | 3 +++ test/commands/app_test.rb | 12 ++++++------ test/configuration/role_test.rb | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/lib/mrsk/configuration/role.rb b/lib/mrsk/configuration/role.rb index 881b331e..55b69731 100644 --- a/lib/mrsk/configuration/role.rb +++ b/lib/mrsk/configuration/role.rb @@ -89,6 +89,9 @@ class Mrsk::Configuration::Role def traefik_labels if running_traefik? { + # Setting a service property ensures that the generated service name will be consistent between versions + "traefik.http.services.#{traefik_service}.loadbalancer.server.scheme" => "http", + "traefik.http.routers.#{traefik_service}.rule" => "PathPrefix(`/`)", "traefik.http.middlewares.#{traefik_service}-retry.retry.attempts" => "5", "traefik.http.middlewares.#{traefik_service}-retry.retry.initialinterval" => "500ms", diff --git a/test/commands/app_test.rb b/test/commands/app_test.rb index f74b21c9..44404558 100644 --- a/test/commands/app_test.rb +++ b/test/commands/app_test.rb @@ -13,7 +13,7 @@ class CommandsAppTest < ActiveSupport::TestCase test "run" do assert_equal \ - "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" --label role=\"web\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", + "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" 
--label role=\"web\" --label traefik.http.services.app-web.loadbalancer.server.scheme=\"http\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", new_command.run.join(" ") end @@ -21,7 +21,7 @@ class CommandsAppTest < ActiveSupport::TestCase @config[:volumes] = ["/local/path:/container/path" ] assert_equal \ - "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --log-opt max-size=\"10m\" --volume /local/path:/container/path --label service=\"app\" --label role=\"web\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", + "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --log-opt max-size=\"10m\" --volume /local/path:/container/path --label service=\"app\" --label role=\"web\" --label traefik.http.services.app-web.loadbalancer.server.scheme=\"http\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", new_command.run.join(" ") end @@ -29,7 +29,7 @@ class CommandsAppTest < ActiveSupport::TestCase 
@config[:healthcheck] = { "path" => "/healthz" } assert_equal \ - "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/healthz || exit 1\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" --label role=\"web\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", + "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/healthz || exit 1\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" --label role=\"web\" --label traefik.http.services.app-web.loadbalancer.server.scheme=\"http\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", new_command.run.join(" ") end @@ -37,7 +37,7 @@ class CommandsAppTest < ActiveSupport::TestCase @config[:healthcheck] = { "cmd" => "/bin/up" } assert_equal \ - "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"/bin/up\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" --label role=\"web\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label 
traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", + "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"/bin/up\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" --label role=\"web\" --label traefik.http.services.app-web.loadbalancer.server.scheme=\"http\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", new_command.run.join(" ") end @@ -45,7 +45,7 @@ class CommandsAppTest < ActiveSupport::TestCase @config[:servers] = { "web" => { "hosts" => [ "1.1.1.1" ], "healthcheck" => { "cmd" => "/bin/healthy" } } } assert_equal \ - "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"/bin/healthy\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" --label role=\"web\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", + "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"/bin/healthy\" --health-interval \"1s\" --log-opt max-size=\"10m\" --label service=\"app\" --label role=\"web\" --label traefik.http.services.app-web.loadbalancer.server.scheme=\"http\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label 
traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", new_command.run.join(" ") end @@ -60,7 +60,7 @@ class CommandsAppTest < ActiveSupport::TestCase @config[:logging] = { "driver" => "local", "options" => { "max-size" => "100m", "max-file" => "3" } } assert_equal \ - "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --log-driver \"local\" --log-opt max-size=\"100m\" --log-opt max-file=\"3\" --label service=\"app\" --label role=\"web\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", + "docker run --detach --restart unless-stopped --name app-web-999 -e MRSK_CONTAINER_NAME=\"app-web-999\" -e RAILS_MASTER_KEY=\"456\" --health-cmd \"curl -f http://localhost:3000/up || exit 1\" --health-interval \"1s\" --log-driver \"local\" --log-opt max-size=\"100m\" --log-opt max-file=\"3\" --label service=\"app\" --label role=\"web\" --label traefik.http.services.app-web.loadbalancer.server.scheme=\"http\" --label traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\" --label traefik.http.middlewares.app-web-retry.retry.attempts=\"5\" --label traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\" --label traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\" dhh/app:999", new_command.run.join(" ") end diff --git a/test/configuration/role_test.rb b/test/configuration/role_test.rb index f3814066..7708bd9d 100644 --- a/test/configuration/role_test.rb +++ b/test/configuration/role_test.rb @@ -42,7 +42,7 @@ class ConfigurationRoleTest < 
ActiveSupport::TestCase end test "special label args for web" do - assert_equal [ "--label", "service=\"app\"", "--label", "role=\"web\"", "--label", "traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\"", "--label", "traefik.http.middlewares.app-web-retry.retry.attempts=\"5\"", "--label", "traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\"", "--label", "traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\"" ], @config.role(:web).label_args + assert_equal [ "--label", "service=\"app\"", "--label", "role=\"web\"", "--label", "traefik.http.services.app-web.loadbalancer.server.scheme=\"http\"", "--label", "traefik.http.routers.app-web.rule=\"PathPrefix(\\`/\\`)\"", "--label", "traefik.http.middlewares.app-web-retry.retry.attempts=\"5\"", "--label", "traefik.http.middlewares.app-web-retry.retry.initialinterval=\"500ms\"", "--label", "traefik.http.routers.app-web.middlewares=\"app-web-retry@docker\"" ], @config.role(:web).label_args end test "custom labels" do @@ -66,7 +66,7 @@ class ConfigurationRoleTest < ActiveSupport::TestCase c[:servers]["beta"] = { "traefik" => "true", "hosts" => [ "1.1.1.5" ] } }) - assert_equal [ "--label", "service=\"app\"", "--label", "role=\"beta\"", "--label", "traefik.http.routers.app-beta.rule=\"PathPrefix(\\`/\\`)\"", "--label", "traefik.http.middlewares.app-beta-retry.retry.attempts=\"5\"", "--label", "traefik.http.middlewares.app-beta-retry.retry.initialinterval=\"500ms\"", "--label", "traefik.http.routers.app-beta.middlewares=\"app-beta-retry@docker\"" ], config.role(:beta).label_args + assert_equal [ "--label", "service=\"app\"", "--label", "role=\"beta\"", "--label", "traefik.http.services.app-beta.loadbalancer.server.scheme=\"http\"", "--label", "traefik.http.routers.app-beta.rule=\"PathPrefix(\\`/\\`)\"", "--label", "traefik.http.middlewares.app-beta-retry.retry.attempts=\"5\"", "--label", "traefik.http.middlewares.app-beta-retry.retry.initialinterval=\"500ms\"", "--label", 
"traefik.http.routers.app-beta.middlewares=\"app-beta-retry@docker\"" ], config.role(:beta).label_args end test "env overwritten by role" do From 7fe24d5048f611b01076abfda26fbbd74df9b7d6 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 16:34:32 +0100 Subject: [PATCH 14/17] Check all hosts before rolling back Hosts could end up out of sync with each other if prune commands are run manually or when new hosts are added. Before rolling back confirm that the required container is available on all hosts and roles. --- lib/mrsk/cli/main.rb | 23 ++++++++++++++++------- test/cli/main_test.rb | 19 +++++++++++++++---- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/lib/mrsk/cli/main.rb b/lib/mrsk/cli/main.rb index 0a3d2ce0..0ead3285 100644 --- a/lib/mrsk/cli/main.rb +++ b/lib/mrsk/cli/main.rb @@ -236,15 +236,24 @@ class Mrsk::Cli::Main < Mrsk::Cli::Base subcommand "lock", Mrsk::Cli::Lock private - def container_available?(version, host: MRSK.primary_host) - available = nil - - on(host) do - first_role = MRSK.roles_on(host).first - available = capture_with_info(*MRSK.app(role: first_role).container_id_for_version(version)).present? + def container_available?(version) + begin + on(MRSK.hosts) do + MRSK.roles_on(host).each do |role| + container_id = capture_with_info(*MRSK.app(role: role).container_id_for_version(version)) + raise "Container not found" unless container_id.present? 
+ end + end + rescue SSHKit::Runner::ExecuteError => e + if e.message =~ /Container not found/ + say "Cannot rollback: #{e.message}" + return false + else + raise + end end - available + true end def deploy_options diff --git a/test/cli/main_test.rb b/test/cli/main_test.rb index 5cac1132..9ba0e54f 100644 --- a/test/cli/main_test.rb +++ b/test/cli/main_test.rb @@ -145,7 +145,8 @@ class CliMainTest < CliTestCase end test "rollback bad version" do - # Mrsk::Cli::Main.any_instance.stubs(:container_available?).returns(false) + Thread.report_on_exception = false + run_command("details") # Preheat MRSK const run_command("rollback", "nonsense").tap do |output| @@ -155,9 +156,19 @@ class CliMainTest < CliTestCase end test "rollback good version" do - Mrsk::Cli::Main.any_instance.stubs(:container_available?).returns(true) - SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info).with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=web", "--filter", "status=running", "--latest", "--format", "\"{{.Names}}\"", "|", "grep -oE \"\\-[^-]+$\"", "|", "cut -c 2-").returns("version-to-rollback\n").at_least_once - SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info).with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=workers", "--filter", "status=running", "--latest", "--format", "\"{{.Names}}\"", "|", "grep -oE \"\\-[^-]+$\"", "|", "cut -c 2-").returns("version-to-rollback\n").at_least_once + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet") + .returns("version-to-rollback\n").at_least_once + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-123$", "--quiet") + .returns("version-to-rollback\n").at_least_once + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :ps, "--filter", 
"label=service=app", "--filter", "label=role=web", "--filter", "status=running", "--latest", "--format", "\"{{.Names}}\"", "|", "grep -oE \"\\-[^-]+$\"", "|", "cut -c 2-") + .returns("version-to-rollback\n").at_least_once + SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info) + .with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=workers", "--filter", "status=running", "--latest", "--format", "\"{{.Names}}\"", "|", "grep -oE \"\\-[^-]+$\"", "|", "cut -c 2-") + .returns("version-to-rollback\n").at_least_once + run_command("rollback", "123", config_file: "deploy_with_accessories").tap do |output| assert_match "Start version 123", output From 86d6f8d674b0ff54a759123f6aba417b8cc6d9b1 Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 17:09:13 +0100 Subject: [PATCH 15/17] Don't assume rolling back in message --- lib/mrsk/cli/main.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mrsk/cli/main.rb b/lib/mrsk/cli/main.rb index 0ead3285..980d26c4 100644 --- a/lib/mrsk/cli/main.rb +++ b/lib/mrsk/cli/main.rb @@ -246,7 +246,7 @@ class Mrsk::Cli::Main < Mrsk::Cli::Base end rescue SSHKit::Runner::ExecuteError => e if e.message =~ /Container not found/ - say "Cannot rollback: #{e.message}" + say "Error looking for container version #{version}: #{e.message}" return false else raise From 971a91da1569798bb10cb2cb63e64a395b18039c Mon Sep 17 00:00:00 2001 From: Donal McBreen Date: Mon, 1 May 2023 14:48:19 +0100 Subject: [PATCH 16/17] Retain a fixed number of containers when pruning Time based container and image retention can have variable space requirements depending on how often we deploy. - Only prune stopped containers, retaining the 5 newest - Then prune dangling images so we only keep images for the retained containers. 
--- lib/mrsk/cli/prune.rb | 4 ++-- lib/mrsk/commands/prune.rb | 16 ++++++++++++---- test/cli/prune_test.rb | 4 ++-- test/commands/prune_test.rb | 4 ++-- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/lib/mrsk/cli/prune.rb b/lib/mrsk/cli/prune.rb index 6698ba27..bcfdd5bf 100644 --- a/lib/mrsk/cli/prune.rb +++ b/lib/mrsk/cli/prune.rb @@ -7,7 +7,7 @@ class Mrsk::Cli::Prune < Mrsk::Cli::Base end end - desc "images", "Prune unused images older than 7 days" + desc "images", "Prune dangling images" def images with_lock do on(MRSK.hosts) do @@ -17,7 +17,7 @@ class Mrsk::Cli::Prune < Mrsk::Cli::Base end end - desc "containers", "Prune stopped containers older than 3 days" + desc "containers", "Prune all stopped containers, except the last 5" def containers with_lock do on(MRSK.hosts) do diff --git a/lib/mrsk/commands/prune.rb b/lib/mrsk/commands/prune.rb index 71218cda..0ac779d5 100644 --- a/lib/mrsk/commands/prune.rb +++ b/lib/mrsk/commands/prune.rb @@ -2,11 +2,19 @@ require "active_support/duration" require "active_support/core_ext/numeric/time" class Mrsk::Commands::Prune < Mrsk::Commands::Base - def images(until_hours: 7.days.in_hours.to_i) - docker :image, :prune, "--all", "--force", "--filter", "label=service=#{config.service}", "--filter", "until=#{until_hours}h" + def images + docker :image, :prune, "--all", "--force", "--filter", "label=service=#{config.service}", "--filter", "dangling=true" end - def containers(until_hours: 3.days.in_hours.to_i) - docker :container, :prune, "--force", "--filter", "label=service=#{config.service}", "--filter", "until=#{until_hours}h" + def containers(keep_last: 5) + pipe \ + docker(:ps, "-q", "-a", "--filter", "label=service=#{config.service}", *stopped_containers_filters), + "tail -n +#{keep_last + 1}", + "while read container_id; do docker rm $container_id; done" end + + private + def stopped_containers_filters + [ "created", "exited", "dead" ].flat_map { |status| ["--filter", "status=#{status}"] } + end end diff 
--git a/test/cli/prune_test.rb b/test/cli/prune_test.rb index 4ff9eedc..40a1d80f 100644 --- a/test/cli/prune_test.rb +++ b/test/cli/prune_test.rb @@ -10,13 +10,13 @@ class CliPruneTest < CliTestCase test "images" do run_command("images").tap do |output| - assert_match /docker image prune --all --force --filter label=service=app --filter until=168h on 1.1.1.\d/, output + assert_match /docker image prune --all --force --filter label=service=app --filter dangling=true on 1.1.1.\d/, output end end test "containers" do run_command("containers").tap do |output| - assert_match /docker container prune --force --filter label=service=app --filter until=72h on 1.1.1.\d/, output + assert_match /docker ps -q -a --filter label=service=app --filter status=created --filter status=exited --filter status=dead \| tail -n \+6 \| while read container_id; do docker rm \$container_id; done on 1.1.1.\d/, output end end diff --git a/test/commands/prune_test.rb b/test/commands/prune_test.rb index adbe2816..bd6a561d 100644 --- a/test/commands/prune_test.rb +++ b/test/commands/prune_test.rb @@ -10,13 +10,13 @@ class CommandsPruneTest < ActiveSupport::TestCase test "images" do assert_equal \ - "docker image prune --all --force --filter label=service=app --filter until=168h", + "docker image prune --all --force --filter label=service=app --filter dangling=true", new_command.images.join(" ") end test "containers" do assert_equal \ - "docker container prune --force --filter label=service=app --filter until=72h", + "docker ps -q -a --filter label=service=app --filter status=created --filter status=exited --filter status=dead | tail -n +6 | while read container_id; do docker rm $container_id; done", new_command.containers.join(" ") end From c83b74dcb74d4388555b0a8f6c426e68216c223a Mon Sep 17 00:00:00 2001 From: David Heinemeier Hansson Date: Tue, 2 May 2023 13:11:31 +0200 Subject: [PATCH 17/17] Simplify domain language to just "boot" and unscoped config keys --- README.md | 8 ++++---- 
lib/mrsk/cli/app.rb | 2 +- lib/mrsk/commander.rb | 6 +++--- lib/mrsk/configuration/boot.rb | 9 +++++---- test/cli/app_test.rb | 2 +- test/commander_test.rb | 10 +++++----- ...roup_strategy.yml => deploy_with_boot_strategy.yml} | 4 ++-- ...gy.yml => deploy_with_precentage_boot_strategy.yml} | 4 ++-- 8 files changed, 23 insertions(+), 22 deletions(-) rename test/fixtures/{deploy_with_group_strategy.yml => deploy_with_boot_strategy.yml} (83%) rename test/fixtures/{deploy_with_precentage_group_strategy.yml => deploy_with_precentage_boot_strategy.yml} (82%) diff --git a/README.md b/README.md index 093d43bf..4cb298fa 100644 --- a/README.md +++ b/README.md @@ -835,17 +835,17 @@ mrsk lock release When deploying to large numbers of hosts, you might prefer not to restart your services on every host at the same time. -MRSK's default is to boot new containers on all hosts in parallel. But you can control this by configuring `group_limit` and `group_wait` as boot options: +MRSK's default is to boot new containers on all hosts in parallel. But you can control this by configuring `boot/limit` and `boot/wait` as options: ```yaml service: myservice boot: - group_limit: 10 # Can also specify as a percentage of total hosts, such as "25%" - group_wait: 2 + limit: 10 # Can also specify as a percentage of total hosts, such as "25%" + wait: 2 ``` -When `group_limit` is specified, containers will be booted on, at most, `group_limit` hosts at once. MRSK will pause for `group_wait` seconds between batches. +When `limit` is specified, containers will be booted on, at most, `limit` hosts at once. MRSK will pause for `wait` seconds between batches. These settings only apply when booting containers (using `mrsk deploy`, or `mrsk app boot`). For other commands, MRSK continues to run commands in parallel across all hosts. 
diff --git a/lib/mrsk/cli/app.rb b/lib/mrsk/cli/app.rb index 7f6fbf44..3722ce6e 100644 --- a/lib/mrsk/cli/app.rb +++ b/lib/mrsk/cli/app.rb @@ -13,7 +13,7 @@ class Mrsk::Cli::App < Mrsk::Cli::Base execute *MRSK.app.tag_current_as_latest end - on(MRSK.hosts, **MRSK.group_strategy) do |host| + on(MRSK.hosts, **MRSK.boot_strategy) do |host| roles = MRSK.roles_on(host) roles.each do |role| diff --git a/lib/mrsk/commander.rb b/lib/mrsk/commander.rb index 160b619e..217918e6 100644 --- a/lib/mrsk/commander.rb +++ b/lib/mrsk/commander.rb @@ -51,9 +51,9 @@ class Mrsk::Commander end end - def group_strategy - if config.boot.group_limit.present? - { in: :groups, limit: config.boot.group_limit, wait: config.boot.group_wait } + def boot_strategy + if config.boot.limit.present? + { in: :groups, limit: config.boot.limit, wait: config.boot.wait } else {} end diff --git a/lib/mrsk/configuration/boot.rb b/lib/mrsk/configuration/boot.rb index dd86b689..1332398a 100644 --- a/lib/mrsk/configuration/boot.rb +++ b/lib/mrsk/configuration/boot.rb @@ -4,8 +4,9 @@ class Mrsk::Configuration::Boot @host_count = config.all_hosts.count end - def group_limit - limit = @options["group_limit"] + def limit + limit = @options["limit"] + if limit.to_s.end_with?("%") @host_count * limit.to_i / 100 else @@ -13,7 +14,7 @@ class Mrsk::Configuration::Boot end end - def group_wait - @options["group_wait"] + def wait + @options["wait"] end end diff --git a/test/cli/app_test.rb b/test/cli/app_test.rb index dc2594b9..3dddb078 100644 --- a/test/cli/app_test.rb +++ b/test/cli/app_test.rb @@ -40,7 +40,7 @@ class CliAppTest < CliTestCase # Strategy is used when booting the containers Mrsk::Cli::App.any_instance.expects(:on).with([ "1.1.1.1" ], in: :groups, limit: 3, wait: 2).with_block_given - run_command("boot", config: :with_group_strategy) + run_command("boot", config: :with_boot_strategy) end test "start" do diff --git a/test/commander_test.rb b/test/commander_test.rb index e5fd2961..25dfabdd 100644 --- 
a/test/commander_test.rb +++ b/test/commander_test.rb @@ -55,19 +55,19 @@ class CommanderTest < ActiveSupport::TestCase end test "default group strategy" do - assert_empty @mrsk.group_strategy + assert_empty @mrsk.boot_strategy end test "specific limit group strategy" do - configure_with(:deploy_with_group_strategy) + configure_with(:deploy_with_boot_strategy) - assert_equal({ in: :groups, limit: 3, wait: 2 }, @mrsk.group_strategy) + assert_equal({ in: :groups, limit: 3, wait: 2 }, @mrsk.boot_strategy) end test "percentage-based group strategy" do - configure_with(:deploy_with_precentage_group_strategy) + configure_with(:deploy_with_precentage_boot_strategy) - assert_equal({ in: :groups, limit: 1, wait: 2 }, @mrsk.group_strategy) + assert_equal({ in: :groups, limit: 1, wait: 2 }, @mrsk.boot_strategy) end private diff --git a/test/fixtures/deploy_with_group_strategy.yml b/test/fixtures/deploy_with_boot_strategy.yml similarity index 83% rename from test/fixtures/deploy_with_group_strategy.yml rename to test/fixtures/deploy_with_boot_strategy.yml index 91ae3cc0..7691eb2e 100644 --- a/test/fixtures/deploy_with_group_strategy.yml +++ b/test/fixtures/deploy_with_boot_strategy.yml @@ -13,5 +13,5 @@ registry: password: pw boot: - group_limit: 3 - group_wait: 2 + limit: 3 + wait: 2 diff --git a/test/fixtures/deploy_with_precentage_group_strategy.yml b/test/fixtures/deploy_with_precentage_boot_strategy.yml similarity index 82% rename from test/fixtures/deploy_with_precentage_group_strategy.yml rename to test/fixtures/deploy_with_precentage_boot_strategy.yml index e738d07d..eb68a52f 100644 --- a/test/fixtures/deploy_with_precentage_group_strategy.yml +++ b/test/fixtures/deploy_with_precentage_boot_strategy.yml @@ -13,5 +13,5 @@ registry: password: pw boot: - group_limit: 25% - group_wait: 2 + limit: 25% + wait: 2