Skip to content

Commit

Permalink
Merge pull request basecamp#219 from basecamp/docker-health-checks
Browse files Browse the repository at this point in the history
  • Loading branch information
dhh authored Apr 28, 2023
2 parents 2ad0dc0 + df202d6 commit 4fa6a6c
Show file tree
Hide file tree
Showing 13 changed files with 183 additions and 112 deletions.
30 changes: 27 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -677,9 +677,11 @@ That'll post a line like follows to a preconfigured chatbot in Basecamp:
[My App] [dhh] Rolled back to version d264c4e92470ad1bd18590f04466787262f605de
```

### Custom healthcheck
### Healthcheck

MRSK defaults to checking the health of your application against `/up` on port 3000 up to 7 times. You can tailor the behaviour with the `healthcheck` setting:
MRSK uses Docker healthchecks to check the health of your application during deployment. Traefik uses this same healthcheck status to determine when a container is ready to receive traffic.

The healthcheck defaults to testing the HTTP response to the path `/up` on port 3000, up to 7 times. You can tailor this behaviour with the `healthcheck` setting:

```yaml
healthcheck:
Expand All @@ -690,7 +692,29 @@ healthcheck:

This will ensure your application is configured with a traefik label for the healthcheck against `/healthz` and that the pre-deploy healthcheck that MRSK performs is done against the same path on port 4000.

The healthcheck also allows for an optional `max_attempts` setting, which will attempt the healthcheck up to the specified number of times before failing the deploy. This is useful for applications that take a while to start up. The default is 7.
You can also specify a custom healthcheck command, which is useful for non-HTTP services:

```yaml
healthcheck:
  cmd: /bin/check_health
```

The top-level healthcheck configuration applies to all services that use
Traefik, by default. You can also specialize the configuration at the role
level:

```yaml
servers:
  job:
    hosts: ...
    cmd: bin/jobs
    healthcheck:
      cmd: bin/check
```

The healthcheck allows for an optional `max_attempts` setting, which will attempt the healthcheck up to the specified number of times before failing the deploy. This is useful for applications that take a while to start up. The default is 7.

Note that the HTTP health checks assume that the `curl` command is available inside the container. If that's not the case, use the healthcheck's `cmd` option to specify an alternative check that the container supports.

## Commands

Expand Down
23 changes: 13 additions & 10 deletions lib/mrsk/cli/app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ def boot
using_version(version_or_latest) do |version|
say "Start container with version #{version} using a #{MRSK.config.readiness_delay}s readiness delay (or reboot if already running)...", :magenta

cli = self

on(MRSK.hosts) do
execute *MRSK.auditor.record("Tagging #{MRSK.config.absolute_image} as the latest image"), verbosity: :debug
execute *MRSK.app.tag_current_as_latest
Expand All @@ -17,19 +15,24 @@ def boot
roles = MRSK.roles_on(host)

roles.each do |role|
execute *MRSK.auditor(role: role).record("Booted app version #{version}"), verbosity: :debug
app = MRSK.app(role: role)
auditor = MRSK.auditor(role: role)

execute *auditor.record("Booted app version #{version}"), verbosity: :debug

if capture_with_info(*MRSK.app(role: role).container_id_for_version(version), raise_on_non_zero_exit: false).present?
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
tmp_version = "#{version}_#{SecureRandom.hex(8)}"
info "Renaming container #{version} to #{tmp_version} as already deployed on #{host}"
execute *MRSK.auditor(role: role).record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
execute *MRSK.app(role: role).rename_container(version: version, new_version: tmp_version)
execute *auditor.record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
execute *app.rename_container(version: version, new_version: tmp_version)
end

old_version = capture_with_info(*MRSK.app(role: role).current_running_version, raise_on_non_zero_exit: false).strip
execute *MRSK.app(role: role).run
sleep MRSK.config.readiness_delay
execute *MRSK.app(role: role).stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
old_version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
execute *app.run

Mrsk::Utils::HealthcheckPoller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }

execute *app.stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
end
end
end
Expand Down
37 changes: 3 additions & 34 deletions lib/mrsk/cli/healthcheck.rb
Original file line number Diff line number Diff line change
@@ -1,46 +1,15 @@
class Mrsk::Cli::Healthcheck < Mrsk::Cli::Base

class HealthcheckError < StandardError; end

default_command :perform

desc "perform", "Health check current app version"
def perform
on(MRSK.primary_host) do
begin
execute *MRSK.healthcheck.run

target = "Health check against #{MRSK.config.healthcheck["path"]}"
attempt = 1
max_attempts = MRSK.config.healthcheck["max_attempts"]

begin
status = capture_with_info(*MRSK.healthcheck.curl)

if status == "200"
info "#{target} succeeded with 200 OK!"
else
raise HealthcheckError, "#{target} failed with status #{status}"
end
rescue SSHKit::Command::Failed
if attempt <= max_attempts
info "#{target} failed to respond, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
sleep attempt
attempt += 1

retry
else
raise
end
end
rescue SSHKit::Command::Failed, HealthcheckError => e
Mrsk::Utils::HealthcheckPoller.wait_for_healthy { capture_with_info(*MRSK.healthcheck.status) }
rescue Mrsk::Utils::HealthcheckPoller::HealthcheckError => e
error capture_with_info(*MRSK.healthcheck.logs)

if e.message =~ /curl/
raise SSHKit::Command::Failed, "#{target} failed to return 200 OK!"
else
raise
end
raise
ensure
execute *MRSK.healthcheck.stop, raise_on_non_zero_exit: false
execute *MRSK.healthcheck.remove, raise_on_non_zero_exit: false
Expand Down
5 changes: 5 additions & 0 deletions lib/mrsk/commands/app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def run
"--name", container_name,
"-e", "MRSK_CONTAINER_NAME=\"#{container_name}\"",
*role.env_args,
*role.health_check_args,
*config.logging_args,
*config.volume_args,
*role.label_args,
Expand All @@ -27,6 +28,10 @@ def start
docker :start, container_name
end

# Builds the command that reports the health status of the container for
# the given version: the container id lookup is piped through xargs into
# `docker inspect`, formatted with DOCKER_HEALTH_STATUS_FORMAT.
def status(version:)
  container_lookup = container_id_for_version(version)
  health_inspect = xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT))

  pipe container_lookup, health_inspect
end

def stop(version: nil)
pipe \
version ? container_id_for_version(version) : current_running_container_id,
Expand Down
2 changes: 2 additions & 0 deletions lib/mrsk/commands/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ module Mrsk::Commands
class Base
delegate :sensitive, :argumentize, to: Mrsk::Utils

DOCKER_HEALTH_STATUS_FORMAT = "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'"

attr_accessor :config

def initialize(config)
Expand Down
5 changes: 3 additions & 2 deletions lib/mrsk/commands/healthcheck.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@ def run
"--label", "service=#{container_name}",
"-e", "MRSK_CONTAINER_NAME=\"#{container_name}\"",
*web.env_args,
*web.health_check_args,
*config.volume_args,
*web.option_args,
config.absolute_image,
web.cmd
end

def curl
[ :curl, "--silent", "--output", "/dev/null", "--write-out", "'%{http_code}'", "--max-time", "2", health_url ]
def status
pipe container_id, xargs(docker(:inspect, "--format", DOCKER_HEALTH_STATUS_FORMAT))
end

def logs
Expand Down
21 changes: 19 additions & 2 deletions lib/mrsk/configuration/role.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ def env_args
argumentize_env_with_secrets env
end

# Docker run arguments that configure the container health check.
# Returns an empty array when no health check command applies to this role;
# otherwise the health command plus a fixed 1s check interval.
def health_check_args
  return [] unless health_check_cmd.present?

  optionize({ "health-cmd" => health_check_cmd, "health-interval" => "1s" })
end

# Resolves the health check command for this role.
#
# Starts from the role-level "healthcheck" specialization (empty when absent).
# When the role runs Traefik, the top-level healthcheck config is used as the
# base and the role-level settings override it. An explicit "cmd" wins;
# otherwise an HTTP check is built from the "port" and "path" settings.
def health_check_cmd
  role_settings = specializations["healthcheck"] || {}
  settings = running_traefik? ? config.healthcheck.merge(role_settings) : role_settings

  custom_cmd = settings["cmd"]
  return custom_cmd if custom_cmd

  http_health_check(port: settings["port"], path: settings["path"])
end

# Returns the "cmd" entry from this role's specializations (nil when not set).
def cmd
specializations["cmd"]
end
Expand Down Expand Up @@ -75,8 +90,6 @@ def traefik_labels
if running_traefik?
{
"traefik.http.routers.#{traefik_service}.rule" => "PathPrefix(`/`)",
"traefik.http.services.#{traefik_service}.loadbalancer.healthcheck.path" => config.healthcheck["path"],
"traefik.http.services.#{traefik_service}.loadbalancer.healthcheck.interval" => "1s",
"traefik.http.middlewares.#{traefik_service}-retry.retry.attempts" => "5",
"traefik.http.middlewares.#{traefik_service}-retry.retry.initialinterval" => "500ms",
"traefik.http.routers.#{traefik_service}.middlewares" => "#{traefik_service}-retry@docker"
Expand Down Expand Up @@ -125,4 +138,8 @@ def merged_env_with_secrets
new_env["clear"] = (clear_app_env + clear_role_env).uniq
end
end

# Builds a curl-based health check command against localhost.
# Returns nil when neither a path nor a port is given.
def http_health_check(port:, path:)
  return unless path.present? || port.present?

  url = URI.join("http://localhost:#{port}", path)
  "curl -f #{url} || exit 1"
end
end
39 changes: 39 additions & 0 deletions lib/mrsk/utils/healthcheck_poller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Polls a caller-supplied status check until the container is ready,
# retrying with a linearly growing pause between attempts.
class Mrsk::Utils::HealthcheckPoller
  # Seconds to pause after a "healthy" status when pause_after_ready is set.
  # NOTE(review): presumably gives Traefik time to notice the healthy
  # container before the caller moves on — confirm.
  TRAEFIK_HEALTHY_DELAY = 1

  # Raised when the reported status is neither "healthy" nor "running".
  class HealthcheckError < StandardError; end

  class << self
    # Calls the block to obtain the container status until it is ready.
    #
    # "healthy" and "running" both count as ready; "running" is what is
    # reported when no health check is configured. Any other status is
    # retried after sleeping for the attempt number in seconds. The retry
    # condition is `attempt <= max_attempts` with attempt starting at 1, so
    # the block may be called up to max_attempts + 1 times before the
    # HealthcheckError is re-raised to the caller.
    #
    # pause_after_ready - when true, sleep once the container is ready:
    #                     TRAEFIK_HEALTHY_DELAY for "healthy", or the
    #                     configured readiness_delay for "running".
    def wait_for_healthy(pause_after_ready: false, &block)
      tries = 1
      limit = MRSK.config.healthcheck["max_attempts"]

      begin
        status = block.call

        case status
        when "healthy"
          sleep TRAEFIK_HEALTHY_DELAY if pause_after_ready
        when "running" # No health check configured
          sleep MRSK.config.readiness_delay if pause_after_ready
        else
          raise HealthcheckError, "container not ready (#{status})"
        end
      rescue HealthcheckError => error
        # Out of retries: propagate the failure currently being handled.
        raise if tries > limit

        info "#{error.message}, retrying in #{tries}s (attempt #{tries}/#{limit})..."
        sleep tries
        tries += 1
        retry
      end

      info "Container is healthy!"
    end

    private
      # Writes an info-level message through SSHKit's configured output.
      def info(message)
        SSHKit.config.output.info(message)
      end
  end
end
11 changes: 9 additions & 2 deletions test/cli/app_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

class CliAppTest < CliTestCase
test "boot" do
# Stub current version fetch
SSHKit::Backend::Abstract.any_instance.stubs(:capture).returns("123") # old version
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running") # health check

run_command("boot").tap do |output|
assert_match "docker tag dhh/app:latest dhh/app:latest", output
Expand All @@ -19,6 +22,10 @@ class CliAppTest < CliTestCase
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", raise_on_non_zero_exit: false)
.returns("12345678") # running version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running") # health check

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=web", "--filter", "status=running", "--latest", "--format", "\"{{.Names}}\"", "|", "grep -oE \"\\-[^-]+$\"", "|", "cut -c 2-", raise_on_non_zero_exit: false)
.returns("123") # old version
Expand Down
54 changes: 25 additions & 29 deletions test/cli/healthcheck_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,62 +5,58 @@ class CliHealthcheckTest < CliTestCase
# Prevent expected failures from outputting to terminal
Thread.report_on_exception = false

SSHKit::Backend::Abstract.any_instance.stubs(:sleep) # No sleeping when retrying
Mrsk::Utils::HealthcheckPoller.stubs(:sleep) # No sleeping when retrying

SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false)
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "MRSK_CONTAINER_NAME=\"healthcheck-app\"", "dhh/app:999")
.with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "MRSK_CONTAINER_NAME=\"healthcheck-app\"", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999")
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false)

# Fail twice to test retry logic
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
.with(:curl, "--silent", "--output", "/dev/null", "--write-out", "'%{http_code}'", "--max-time", "2", "http://localhost:3999/up")
.raises(SSHKit::Command::Failed)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("starting")
.then
.raises(SSHKit::Command::Failed)
.returns("unhealthy")
.then
.returns("200")
.returns("healthy")

run_command("perform").tap do |output|
assert_match "Health check against /up failed to respond, retrying in 1s (attempt 1/7)...", output
assert_match "Health check against /up failed to respond, retrying in 2s (attempt 2/7)...", output
assert_match "Health check against /up succeeded with 200 OK!", output
assert_match "container not ready (starting), retrying in 1s (attempt 1/7)...", output
assert_match "container not ready (unhealthy), retrying in 2s (attempt 2/7)...", output
assert_match "Container is healthy!", output
end
end

test "perform failing because of curl" do
test "perform failing to become healthy" do
# Prevent expected failures from outputting to terminal
Thread.report_on_exception = false

SSHKit::Backend::Abstract.any_instance.stubs(:execute) # No need to execute anything here
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
.with(:curl, "--silent", "--output", "/dev/null", "--write-out", "'%{http_code}'", "--max-time", "2", "http://localhost:3999/up")
.returns("curl: command not found")
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :logs, "--tail", 50, "2>&1")

exception = assert_raises SSHKit::Runner::ExecuteError do
run_command("perform")
end
assert_match "Health check against /up failed to return 200 OK!", exception.message
end
Mrsk::Utils::HealthcheckPoller.stubs(:sleep) # No sleeping when retrying

test "perform failing for unknown reason" do
# Prevent expected failures from outputting to terminal
Thread.report_on_exception = false
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false)
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :run, "--detach", "--name", "healthcheck-app-999", "--publish", "3999:3000", "--label", "service=healthcheck-app", "-e", "MRSK_CONTAINER_NAME=\"healthcheck-app\"", "--health-cmd", "\"curl -f http://localhost:3000/up || exit 1\"", "--health-interval", "\"1s\"", "dhh/app:999")
SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :container, :rm, raise_on_non_zero_exit: false)

SSHKit::Backend::Abstract.any_instance.stubs(:execute) # No need to execute anything here
# Continually report unhealthy
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
.with(:curl, "--silent", "--output", "/dev/null", "--write-out", "'%{http_code}'", "--max-time", "2", "http://localhost:3999/up")
.returns("500")
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy")

# Capture logs when failing
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :logs, "--tail", 50, "2>&1")
.returns("some log output")

exception = assert_raises do
run_command("perform")
end
assert_match "Health check against /up failed with status 500", exception.message
assert_match "container not ready (unhealthy)", exception.message
end

private
Expand Down
Loading

0 comments on commit 4fa6a6c

Please sign in to comment.