From 6c6af5465523dce42990e06b1ee1888de7bd4f68 Mon Sep 17 00:00:00 2001 From: Maximilien Richer Date: Sat, 25 Jan 2025 18:50:55 +0100 Subject: [PATCH] Switch back staging telemetry to docker and update stack --- .../telemetry/deploy/telemetry-service.hcl | 158 ++++-------------- .../telemetry/deploy/telemetry-storage.hcl | 97 +++++++++++ .../app/telemetry/deploy/telemetry-system.hcl | 75 +++++---- 3 files changed, 171 insertions(+), 159 deletions(-) create mode 100644 cluster/staging/app/telemetry/deploy/telemetry-storage.hcl diff --git a/cluster/staging/app/telemetry/deploy/telemetry-service.hcl b/cluster/staging/app/telemetry/deploy/telemetry-service.hcl index 47554e22..4bc9f8ac 100644 --- a/cluster/staging/app/telemetry/deploy/telemetry-service.hcl +++ b/cluster/staging/app/telemetry/deploy/telemetry-service.hcl @@ -2,95 +2,6 @@ job "telemetry-service" { datacenters = ["neptune", "dathomir", "corrin", "bespin"] type = "service" - group "prometheus" { - count = 2 - - network { - port "prometheus" { - static = 9090 - } - } - - constraint { - attribute = "${attr.unique.hostname}" - operator = "set_contains_any" - value = "df-pw5,origan" - } - - task "prometheus" { - driver = "nix2" - config { - nixpkgs = "github:nixos/nixpkgs/nixos-22.11" - packages = [ "#prometheus", "#coreutils", "#findutils", "#bash" ] - command = "prometheus" - args = [ - "--config.file=/etc/prom/prometheus.yml", - "--storage.tsdb.path=/data", - "--storage.tsdb.retention.size=5GB", - ] - bind = { - "/mnt/ssd/prometheus" = "/data" - } - } - - template { - data = file("../config/prometheus.yml") - destination = "etc/prom/prometheus.yml" - } - - template { - data = "{{ key \"secrets/consul/consul-ca.crt\" }}" - destination = "etc/prom/consul.crt" - } - - template { - data = "{{ key \"secrets/consul/consul-client.crt\" }}" - destination = "etc/prom/consul-client.crt" - } - - template { - data = "{{ key \"secrets/consul/consul-client.key\" }}" - destination = "etc/prom/consul-client.key" - } - - template { - data = "{{ key \"secrets/nomad/nomad-ca.crt\" }}" - destination = "etc/prom/nomad-ca.crt" - } - - template { - data = "{{ key \"secrets/nomad/nomad-client.crt\" }}" - destination = "etc/prom/nomad-client.crt" - } - - template { - data = "{{ key \"secrets/nomad/nomad-client.key\" }}" - destination = "etc/prom/nomad-client.key" - } - - resources { - memory = 500 - cpu = 200 - } - - service { - port = "prometheus" - name = "prometheus" - check { - type = "http" - path = "/" - interval = "60s" - timeout = "5s" - check_restart { - limit = 3 - grace = "90s" - ignore_warnings = false - } - } - } - } - } - group "grafana" { count = 1 @@ -106,50 +17,46 @@ job "telemetry-service" { sidecar = false } - driver = "nix2" + driver = "docker" config { - packages = [ "#litestream" ] - command = "litestream" + image = "litestream/litestream:0.3.13" args = [ "restore", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db" ] - bind = { - "../alloc/data" = "/ephemeral", - } + volumes = [ + "../alloc/data:/ephemeral", + "secrets/litestream.yml:/etc/litestream.yml" + ] } + user = "472" template { data = file("../config/grafana-litestream.yml") - destination = "etc/litestream.yml" + destination = "secrets/litestream.yml" } resources { - memory = 100 - memory_max = 1000 + memory = 50 + memory_max = 200 cpu = 100 } } task "grafana" { - driver = "nix2" + driver = "docker" config { - nixpkgs = "github:nixos/nixpkgs/nixos-22.11" - packages = [ "#grafana" ] - command = "grafana-server" - args = [ - "-homepath", "/share/grafana", - "cfg:default.paths.data=/grafana", - "cfg:default.paths.provisioning=/grafana-provisioning" + image = "grafana/grafana:11.4.0" + network_mode = "host" + ports = [ "grafana" ] + volumes = [ + "../alloc/data:/var/lib/grafana", + "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml" ] - - bind = { - "../alloc/data" = "/grafana", - } } template { data = file("../config/grafana-datasource-prometheus.yaml") - destination = "grafana-provisioning/datasources/prometheus.yaml" + destination = "secrets/prometheus.yaml" } template { @@ -163,8 +70,9 @@ GF_SECURITY_ADMIN_PASSWORD={{ key "secrets/telemetry/grafana/admin_password" }} } resources { - memory = 300 - cpu = 300 + memory = 100 + memory_max = 400 + cpu = 300 } restart { @@ -181,9 +89,12 @@ GF_SECURITY_ADMIN_PASSWORD={{ key "secrets/telemetry/grafana/admin_password" }} "tricot grafana.staging.deuxfleurs.org", "d53-cname grafana.staging.deuxfleurs.org", ] - port = "grafana" + port = 3719 + address_mode = "driver" check { type = "tcp" + port = 3719 + address_mode = "driver" interval = "60s" timeout = "5s" check_restart { @@ -196,26 +107,27 @@ GF_SECURITY_ADMIN_PASSWORD={{ key "secrets/telemetry/grafana/admin_password" }} } task "replicate-db" { - driver = "nix2" + driver = "docker" config { - packages = [ "#litestream" ] - command = "litestream" + image = "litestream/litestream:0.3.13" args = [ "replicate", "-config", "/etc/litestream.yml" ] - bind = { - "../alloc/data" = "/ephemeral", - } + volumes = [ + "../alloc/data:/ephemeral", + "secrets/litestream.yml:/etc/litestream.yml" + ] } + user = "472" template { data = file("../config/grafana-litestream.yml") - destination = "etc/litestream.yml" + destination = "secrets/litestream.yml" } resources { - memory = 100 - memory_max = 500 + memory = 50 + memory_max = 200 cpu = 100 } } diff --git a/cluster/staging/app/telemetry/deploy/telemetry-storage.hcl b/cluster/staging/app/telemetry/deploy/telemetry-storage.hcl new file mode 100644 index 00000000..fbde6973 --- /dev/null +++ b/cluster/staging/app/telemetry/deploy/telemetry-storage.hcl @@ -0,0 +1,97 @@ +job "telemetry-storage" { + datacenters = ["neptune", "dathomir", "corrin", "bespin"] + type = "service" + + group "prometheus" { + count = 2 + + network { + port "prometheus" { + static = 9090 + } + } + + constraint { + attribute = "${attr.unique.hostname}" + operator = "set_contains_any" + value = "df-pw5,origan" + } + + task "prometheus" { + driver = "docker" + config { + image = "prom/prometheus:v3.1.0" + network_mode = "host" + ports = [ "prometheus" ] + args = [ + "--config.file=/etc/prometheus/prometheus.yml", + "--storage.tsdb.path=/data", + "--storage.tsdb.retention.size=20GB", + ] + volumes = [ + "secrets:/etc/prometheus", + "/mnt/ssd/prometheus:/data" + ] + } + + template { + data = file("../config/prometheus.yml") + destination = "secrets/prometheus.yml" + } + + template { + data = "{{ key \"secrets/consul/consul-ca.crt\" }}" + destination = "secrets/consul.crt" + } + + template { + data = "{{ key \"secrets/consul/consul-client.crt\" }}" + destination = "secrets/consul-client.crt" + } + + template { + data = "{{ key \"secrets/consul/consul-client.key\" }}" + destination = "secrets/consul-client.key" + } + + template { + data = "{{ key \"secrets/nomad/nomad-ca.crt\" }}" + destination = "secrets/nomad-ca.crt" + } + + template { + data = "{{ key \"secrets/nomad/nomad-client.crt\" }}" + destination = "secrets/nomad-client.crt" + } + + template { + data = "{{ key \"secrets/nomad/nomad-client.key\" }}" + destination = "secrets/nomad-client.key" + } + + resources { + memory = 500 + cpu = 200 + } + + service { + port = 9090 + address_mode = "driver" + name = "prometheus" + check { + type = "http" + path = "/" + port = 9090 + address_mode = "driver" + interval = "60s" + timeout = "5s" + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } + } + } + } + } +} diff --git a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl index a97c7b10..9cd254a9 100644 --- a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl +++ b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl @@ -4,43 +4,46 @@ job "telemetry-system" { priority = "100" group "collector" { - network { - port "node_exporter" { static = 9100 } - } + network { + port "node_exporter" { static = 9100 } + } - task "node_exporter" { - driver = "nix2" + task "node_exporter" { + driver = "docker" - config { - packages = [ "#prometheus-node-exporter" ] - command = "node_exporter" - args = [ "--path.rootfs=/host" ] - bind_read_only = { - "/" = "/host" - } - } + config { + image = "quay.io/prometheus/node-exporter:v1.8.1" + network_mode = "host" + volumes = [ + "/:/host:ro,rslave" + ] + args = [ "--path.rootfs=/host" ] + } - resources { - cpu = 50 - memory = 40 - } + resources { + cpu = 50 + memory = 40 + } - service { - name = "node-exporter" - tags = [ "telemetry" ] - port = "node_exporter" - check { - type = "http" - path = "/" - interval = "60s" - timeout = "5s" - check_restart { - limit = 3 - grace = "90s" - ignore_warnings = false - } - } - } - } - } -} + service { + tags = [ "telemetry" ] + port = 9100 + address_mode = "driver" + name = "node-exporter" + check { + type = "http" + path = "/" + port = 9100 + address_mode = "driver" + interval = "60s" + timeout = "5s" + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } + } + } + } + } + }