From 56ff4c5cfdfc7fd84a10bd1d69418109e25c2560 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Tue, 20 Sep 2022 17:13:46 +0200
Subject: [PATCH] Prod-like telemetry into staging

---
 .../config/grafana-datasource-prometheus.yaml |   7 +
 .../app/telemetry/config/prometheus.yml       |  30 +++
 .../app/telemetry/deploy/telemetry-system.hcl | 215 ++++--------------
 .../app/telemetry/deploy/telemetry.hcl        |  72 +++---
 .../secrets/telemetry/grafana/admin_password  |   1 +
 .../secrets/telemetry/grafana/s3_access_key   |   1 +
 .../secrets/telemetry/grafana/s3_secret_key   |   1 +
 .../config/apm-config.yaml                    |   0
 .../config/filebeat.yml                       |   0
 .../config/grafana-litestream.yml             |  10 +
 .../provisioning/datasources/elastic.yaml     |   0
 .../config/otel-config.yaml                   |   0
 .../deploy/telemetry-system.hcl               | 182 +++++++++++++++
 .../deploy/telemetry.hcl                      | 181 +++++++++++++++
 14 files changed, 494 insertions(+), 206 deletions(-)
 create mode 100644 cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml
 create mode 100644 cluster/staging/app/telemetry/config/prometheus.yml
 create mode 100644 cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password
 create mode 100644 cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key
 create mode 100644 cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key
 rename {cluster/staging/app/telemetry => experimental/bad.telemetry-elastic}/config/apm-config.yaml (100%)
 rename {cluster/staging/app/telemetry => experimental/bad.telemetry-elastic}/config/filebeat.yml (100%)
 create mode 100644 experimental/bad.telemetry-elastic/config/grafana-litestream.yml
 rename {cluster/staging/app/telemetry => experimental/bad.telemetry-elastic}/config/grafana/provisioning/datasources/elastic.yaml (100%)
 rename {cluster/staging/app/telemetry => experimental/bad.telemetry-elastic}/config/otel-config.yaml (100%)
 create mode 100644 experimental/bad.telemetry-elastic/deploy/telemetry-system.hcl
 create mode 100644 experimental/bad.telemetry-elastic/deploy/telemetry.hcl

diff --git a/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml
new file mode 100644
index 0000000..36b67e6
--- /dev/null
+++ b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1
+
+datasources:
+  - name: DS_PROMETHEUS
+    type: prometheus
+    access: proxy
+    url: http://prometheus.service.staging.consul:9090
diff --git a/cluster/staging/app/telemetry/config/prometheus.yml b/cluster/staging/app/telemetry/config/prometheus.yml
new file mode 100644
index 0000000..e0e786d
--- /dev/null
+++ b/cluster/staging/app/telemetry/config/prometheus.yml
@@ -0,0 +1,30 @@
+global:
+  scrape_interval: 15s # By default, scrape targets every 15 seconds.
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'node-exporter'
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'node-exporter'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
+
+  - job_name: 'garage'
+    authorization:
+      type: Bearer
+      credentials: {{ key "secrets/garage-staging/metrics_token" }}
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'garage-staging-admin'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
diff --git a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
index 3e26c2e..e2bad61 100644
--- a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
@@ -1,182 +1,49 @@
 job "telemetry-system" {
-  datacenters = ["neptune"]
-  type = "system"
+  datacenters = ["neptune"]
+  type = "system"
+  priority = "100"
 
-  group "elasticsearch" {
+  group "collector" {
     network {
-      port "elastic" {
-        static = 9200
-      }
-      port "elastic_internal" {
-        static = 9300
-      }
+      port "node_exporter" { static = 9100 }
     }
 
-    task "elastic" {
-      driver = "docker"
-      config {
-        image = "docker.elastic.co/elasticsearch/elasticsearch:8.2.0"
-        network_mode = "host"
-        volumes = [
-          "/mnt/ssd/telemetry/es_data:/usr/share/elasticsearch/data",
-          "secrets/elastic-certificates.p12:/usr/share/elasticsearch/config/elastic-certificates.p12",
-        ]
-        ports = [ "elastic", "elastic_internal" ]
-        sysctl = {
-          #"vm.max_map_count" = "262144",
-        }
-        ulimit = {
-          memlock = "9223372036854775807:9223372036854775807",
+    task "node_exporter" {
+      driver = "docker"
+
+      config {
+        image = "quay.io/prometheus/node-exporter:v1.1.2"
+        network_mode = "host"
+        volumes = [
+          "/:/host:ro,rslave"
+        ]
+        args = [ "--path.rootfs=/host" ]
+      }
+
+      resources {
+        cpu = 50
+        memory = 40
+      }
+
+      service {
+        tags = [ "telemetry" ]
+        port = 9100
+        address_mode = "driver"
+        name = "node-exporter"
+        check {
+          type = "http"
+          path = "/"
+          port = 9100
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
        }
      }
-
-      user = "1000"
-
-      resources {
-        memory = 1500
-        cpu = 500
-      }
-
-      template {
-        data = "{{ key \"secrets/telemetry/elasticsearch/elastic-certificates.p12\" }}"
-        destination = "secrets/elastic-certificates.p12"
-      }
-
-      template {
-        data = <