From ecb4cabcf0ea52226d95f1e0e0f2f5d1695133a5 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Sun, 27 Aug 2023 13:56:51 +0200
Subject: [PATCH] prod garage: add health check using admin api's '/health'

---
Note: this copy of the patch was recovered from a whitespace-collapsed
extract. Line breaks were rebuilt from the hunk headers; indentation is
reconstructed with 2-space nesting and may differ from the real file.
The first hunk is a whitespace-only change to the "k2v" port line whose
exact original spacing could not be recovered.

 cluster/prod/app/garage/deploy/garage.hcl | 113 ++++++++++++++--------
 1 file changed, 71 insertions(+), 42 deletions(-)

diff --git a/cluster/prod/app/garage/deploy/garage.hcl b/cluster/prod/app/garage/deploy/garage.hcl
index 26f76de..7ed963c 100644
--- a/cluster/prod/app/garage/deploy/garage.hcl
+++ b/cluster/prod/app/garage/deploy/garage.hcl
@@ -14,7 +14,7 @@ job "garage" {
       port "rpc" { static = 3901 }
       port "web" { static = 3902 }
       port "admin" { static = 3903 }
-      port "k2v" { static = 3904 }
+      port "k2v" { static = 3904 }
     }
 
     update {
@@ -26,7 +26,6 @@ job "garage" {
     task "server" {
       driver = "docker"
       config {
-        advertise_ipv6_address = true
         image = "dxflrs/garage:v0.8.2"
         command = "/garage"
         args = [ "server" ]
@@ -70,20 +69,22 @@ job "garage" {
 
       kill_timeout = "20s"
 
+      restart {
+        interval = "30m"
+        attempts = 10
+        delay = "15s"
+        mode = "delay"
+      }
+
+      #### Configuration for service ports: admin port (internal use only)
+
       service {
-        tags = [
-          "garage_api",
-          "tricot garage.deuxfleurs.fr",
-          "tricot *.garage.deuxfleurs.fr",
-          "tricot-site-lb",
-        ]
-        port = 3900
-        address_mode = "driver"
-        name = "garage-api"
+        port = "admin"
+        address_mode = "host"
+        name = "garage-admin"
+        # Check that Garage is alive and answering TCP connections
         check {
           type = "tcp"
-          port = 3900
-          address_mode = "driver"
           interval = "60s"
           timeout = "5s"
           check_restart {
@@ -94,6 +95,41 @@ job "garage" {
         }
       }
 
+      #### Configuration for service ports: externally available ports (API, web)
+
+      service {
+        tags = [
+          "garage_api",
+          "tricot garage.deuxfleurs.fr",
+          "tricot *.garage.deuxfleurs.fr",
+          "tricot-site-lb",
+        ]
+        port = "s3"
+        address_mode = "host"
+        name = "garage-api"
+        # Check 1: Garage is alive and answering TCP connections
+        check {
+          name = "garage-api-live"
+          type = "tcp"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+        # Check 2: Garage is in a healthy state and requests should be routed here
+        check {
+          name = "garage-api-healthy"
+          port = "admin"
+          type = "http"
+          path = "/health"
+          interval = "60s"
+          timeout = "5s"
+        }
+      }
+
       service {
         tags = [
           "garage-web",
@@ -105,13 +141,13 @@ job "garage" {
           "tricot-add-header X-Content-Type-Options nosniff",
           "tricot-site-lb",
         ]
-        port = 3902
-        address_mode = "driver"
+        port = "web"
+        address_mode = "host"
         name = "garage-web"
+        # Check 1: Garage is alive and answering TCP connections
         check {
+          name = "garage-web-live"
           type = "tcp"
-          port = 3902
-          address_mode = "driver"
           interval = "60s"
           timeout = "5s"
           check_restart {
@@ -120,23 +156,14 @@ job "garage" {
             ignore_warnings = false
           }
         }
-      }
-
-      service {
-        port = 3903
-        address_mode = "driver"
-        name = "garage-admin"
+        # Check 2: Garage is in a healthy state and requests should be routed here
         check {
-          type = "tcp"
-          port = 3903
-          address_mode = "driver"
+          name = "garage-web-healthy"
+          port = "admin"
+          type = "http"
+          path = "/health"
           interval = "60s"
           timeout = "5s"
-          check_restart {
-            limit = 3
-            grace = "90s"
-            ignore_warnings = false
-          }
         }
       }
 
@@ -146,13 +173,13 @@ job "garage" {
           "tricot k2v.deuxfleurs.fr",
           "tricot-site-lb",
         ]
-        port = 3904
-        address_mode = "driver"
+        port = "k2v"
+        address_mode = "host"
         name = "garage-k2v"
+        # Check 1: Garage is alive and answering TCP connections
         check {
+          name = "garage-k2v-live"
           type = "tcp"
-          port = 3904
-          address_mode = "driver"
           interval = "60s"
           timeout = "5s"
           check_restart {
@@ -161,13 +188,15 @@ job "garage" {
             ignore_warnings = false
           }
         }
-      }
-
-      restart {
-        interval = "30m"
-        attempts = 10
-        delay = "15s"
-        mode = "delay"
+        # Check 2: Garage is in a healthy state and requests should be routed here
+        check {
+          name = "garage-k2v-healthy"
+          port = "admin"
+          type = "http"
+          path = "/health"
+          interval = "60s"
+          timeout = "5s"
+        }
       }
     }
   }