Compare commits

...

21 commits

Author SHA1 Message Date
8418c40250
Forgejo template 2025-04-07 21:58:31 +02:00
Baptiste Jonglez
fe68fdf54a plume: increase memory again 2025-03-26 20:21:57 +01:00
Baptiste Jonglez
187d36eb9b deploy_nixos: add help to apply changes without rebooting in production 2025-03-26 00:17:59 +01:00
Baptiste Jonglez
fd6275f5bc prod: Fix vim configuration syntax (different between staging and prod due to NixOS version difference) 2025-03-26 00:17:08 +01:00
Baptiste Jonglez
fc88a063b1 node_exporter: avoid using network mode host 2025-03-25 22:21:35 +01:00
Baptiste Jonglez
bb8c9db2ed telemetry: avoid network mode host, and poll less often 2025-03-25 22:12:42 +01:00
451068d716 Merge pull request 'prod: telemetry: Add smartctl_exporter based on staging work' (#53) from prod_smartctl_monitoring into main
Reviewed-on: #53
2025-03-25 21:09:08 +00:00
Baptiste Jonglez
797f946578 prod: telemetry: Add smartctl_exporter based on staging work 2025-03-24 17:53:17 +01:00
Baptiste Jonglez
596b7ab966 prod: telemetry: rename node-exporter job 2025-03-24 17:51:55 +01:00
Baptiste Jonglez
ec1fa3e540 staging: telemetry: Use a init task to create fake disk devices for smartctl_exporter 2025-03-24 17:47:05 +01:00
67230dd60c
guichet now advertise the correct dxfl login command 2025-03-24 16:48:18 +01:00
305c160899
guichet upgrade 2025-03-21 00:27:05 +01:00
Baptiste Jonglez
8d9aa00de5 staging: harden config of smartctl exporter
It currently requires all nodes to have /dev/sda (the device passthrough is hardcoded for now)
2025-03-19 23:46:55 +01:00
Baptiste Jonglez
5790453ff1 nix: Allow all capabilities in Nomad
This will be necessary for the smartctl exporter since it needs Linux
capabilities that are not allowed by default in Nomad.

We only have trusted Nomad jobs, and we already allow privileged
containers anyway, so there is no security impact.
2025-03-19 23:39:04 +01:00
Baptiste Jonglez
a2a470ac3d staging: promote piranha to Nomad server (caribou is dead) 2025-03-19 23:08:49 +01:00
Baptiste Jonglez
2009572fea prod: telemetry: move storage from bespin/scorpio to bespin/corrin 2025-03-12 21:22:56 +01:00
Baptiste Jonglez
8f0a45f03e staging: telemetry: add smartctl exporter 2025-03-12 21:06:56 +01:00
Baptiste Jonglez
b98e72af96 staging: telemetry: Fix metric collection due to faulty Consul connection 2025-03-12 20:51:49 +01:00
Baptiste Jonglez
e805cf5cf6 Augmentation stockage prometheus
La limite actuelle correspond à environ 2 mois d'historique prometheus,
c'est parfois trop peu pour pouvoir relever des tendances sur le long terme.
2025-03-11 23:10:07 +01:00
6b52ccd374 Merge pull request 'upgrade garage to v1.99.1' (#49) from garage-1.99 into main
Reviewed-on: #49
2025-03-09 09:48:50 +00:00
Armaël Guéneau
c5a0577cbf upgrade garage to v1.99.1 2025-03-09 10:44:12 +01:00
23 changed files with 909 additions and 136 deletions

View file

@ -3,11 +3,6 @@ job "garage" {
type = "system"
priority = 80
update {
max_parallel = 2
min_healthy_time = "60s"
}
group "garage" {
network {
port "s3" { static = 3900 }
@ -18,7 +13,11 @@ job "garage" {
}
update {
max_parallel = 10
# When upgrading the service, stop and upgrade nodes one by one.
# This should allow performing minor upgrades without downtime.
# (A higher value for max_parallel would risk stopping nodes in different
# zones at the same time, which would make the cluster unavailable)
max_parallel = 1
min_healthy_time = "30s"
healthy_deadline = "5m"
}
@ -26,7 +25,7 @@ job "garage" {
task "server" {
driver = "docker"
config {
image = "superboum/garage:v1.0.0-rc1-hotfix-red-ftr-wquorum"
image = "dxflrs/garage:v1.99.1-internal"
command = "/garage"
args = [ "server" ]
network_mode = "host"

View file

@ -13,7 +13,7 @@ job "guichet" {
task "guichet" {
driver = "docker"
config {
image = "dxflrs/guichet:0x4y7bj1qb8w8hckvpbzlgyxh63j66ij"
image = "dxflrs/guichet:aqd78kjs4dmizm8gax67d8sd68l9gcf5"
args = [ "server", "-config", "/etc/config.json" ]
readonly_rootfs = true
ports = [ "web_port" ]

View file

@ -17,8 +17,8 @@ job "plume-blog" {
network_mode = "host"
ports = [ "cache_port" ]
# cache
mount {
# cache
mount {
type = "tmpfs"
target = "/var/lib/varnish/varnishd:exec"
readonly = false
@ -64,8 +64,8 @@ job "plume-blog" {
}
resources {
memory = 1024
memory_max = 1024
memory = 1500
memory_max = 1500
cpu = 100
}

View file

@ -16,6 +16,17 @@ scrape_configs:
cert_file: /etc/prometheus/consul-client.crt
key_file: /etc/prometheus/consul-client.key
- job_name: 'smartctl-exporter'
scrape_interval: 300s
consul_sd_configs:
- server: 'https://localhost:8501'
services:
- 'smartctl-exporter'
tls_config:
ca_file: /etc/prometheus/consul-ca.crt
cert_file: /etc/prometheus/consul-client.crt
key_file: /etc/prometheus/consul-client.key
- job_name: 'tricot'
consul_sd_configs:
- server: 'https://localhost:8501'

View file

@ -0,0 +1,170 @@
{
"defaultAction": "SCMP_ACT_ERRNO",
"defaultErrnoRet": 1,
"architectures": [
"SCMP_ARCH_X86_64"
],
"syscalls": [
{
"names": [
"rt_sigaction",
"rt_sigprocmask",
"getpid",
"fcntl",
"fstatfs",
"gettid",
"futex",
"getdents64",
"epoll_ctl",
"tgkill",
"openat",
"read",
"close",
"nanosleep",
"getsockname",
"setsockopt",
"chdir",
"capget",
"prctl",
"accept4",
"fstat",
"getcwd",
"setuid",
"setgid",
"setgroups",
"capset",
"newfstatat",
"write",
"writev",
"mmap",
"brk",
"rt_sigreturn",
"access",
"execve",
"getppid",
"exit_group",
"faccessat2",
"mprotect",
"pread64",
"arch_prctl",
"set_tid_address",
"set_robust_list",
"rseq",
"munmap",
"madvise",
"sigaltstack",
"statfs",
"waitid",
"readlinkat",
"eventfd2",
"epoll_create1",
"pipe2",
"pidfd_send_signal",
"pidfd_open",
"readlink",
"epoll_pwait",
"dup3",
"bind",
"listen",
"getrlimit",
"sched_getaffinity",
"sched_yield"
],
"action": "SCMP_ACT_ALLOW",
"comment": "globally needed by the go runtime"
},
{
"names": [
"open",
"uname"
],
"action": "SCMP_ACT_ALLOW",
"comment": "Used by smartctl"
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow SG_IO (aka SCSI commands) on ioctl as it's what's used to read SMART data",
"args": [
{
"index": 1,
"value": 8837,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow NVME_IOCTL_ID command (0x4e40) on ioctl as it's what's used to read data on NVMe devices",
"args": [
{
"index": 1,
"value": 20032,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow NVME_IOCTL_ADMIN_CMD command (0xc0484e41) on ioctl as it's what's used to read data on NVMe devices. For some reason, it needs to be encoded as 0xffffffffc0484e41",
"args": [
{
"index": 1,
"value": 18446744072640548417,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ERRNO",
"comment": "Debug to allow/deny all ioctl (change to _LOG, _ALLOW, or _ERRNO appropriately)"
},
{
"names": [
"clone"
],
"action": "SCMP_ACT_ALLOW",
"comment": "partially allow clone as per docker config",
"args": [
{
"index": 0,
"value": 2114060288,
"op": "SCMP_CMP_MASKED_EQ"
}
]
},
{
"names": [
"clone3"
],
"action": "SCMP_ACT_ERRNO",
"comment": "disable clone3 in a specific way as per docker's default config",
"errnoRet": 38
},
{
"names": [
"socket"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow IPv4 sockets",
"args": [
{
"index": 0,
"value": 2,
"op": "SCMP_CMP_EQ"
}
]
}
]
}

View file

@ -0,0 +1,51 @@
job "telemetry-node-exporter" {
datacenters = ["neptune", "scorpio", "bespin", "corrin", "dathomir"]
type = "system"
priority = "100"
group "node_exporter" {
network {
port "node_exporter" { static = 9100 }
}
task "node_exporter" {
driver = "docker"
config {
image = "quay.io/prometheus/node-exporter:v1.8.1"
ports = ["node_exporter"]
volumes = [
"/:/host:ro,rslave"
]
args = [
"--web.listen-address=0.0.0.0:${NOMAD_PORT_node_exporter}",
"--path.rootfs=/host"
]
}
resources {
cpu = 50
memory = 40
}
service {
tags = [ "telemetry" ]
port = "node_exporter"
name = "node-exporter"
check {
type = "http"
path = "/"
port = 9100
address_mode = "driver"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
}
}
}

View file

@ -0,0 +1,131 @@
job "telemetry-smartctl-exporter" {
datacenters = ["neptune", "scorpio", "bespin", "corrin", "dathomir"]
type = "system"
priority = "100"
group "smartctl_exporter" {
network {
port "smartctl_exporter" { static = 9101 }
}
# This init task creates "fake" disk devices. This way, we can
# restrict which devices we expose to smartctl_exporter while having
# the same task configuration on all hosts.
task "create_fake_disks" {
driver = "docker"
user = "root"
config {
image = "bash:5.2.37"
args = [
"-x", "${NOMAD_TASK_DIR}/create_fake_disks.sh"
]
readonly_rootfs = true
mount {
type = "bind"
target = "/dev"
source = "/dev"
readonly = false
}
}
template {
data = <<EOF
echo "Checking existing disks and creating fake devices if needed..."
[ -a "/dev/sda" ] || ln -s /dev/null /dev/sda
[ -a "/dev/sdb" ] || ln -s /dev/null /dev/sdb
[ -a "/dev/nvme0" ] || ln -s /dev/null /dev/nvme0
EOF
destination = "local/create_fake_disks.sh"
perms = 755
}
resources {
cpu = 10
memory = 10
}
lifecycle {
hook = "prestart"
sidecar = false
}
}
task "smartctl_exporter" {
driver = "docker"
# Necessary to use low-level SMART and NVMe commands
user = "root"
config {
image = "prometheuscommunity/smartctl-exporter:v0.13.0"
args = [
"--web.listen-address=0.0.0.0:${NOMAD_PORT_smartctl_exporter}"
]
ports = ["smartctl_exporter"]
# CAP_SYS_RAWIO is needed for SMART requests, while CAP_SYS_ADMIN
# is needed for NVMe requests.
# These capabilities need to be allowed in the Nomad client config.
cap_drop = ["all"]
cap_add = ["CAP_SYS_RAWIO", "CAP_SYS_ADMIN"]
# Hardening options to avoid running the container as privileged,
# while still allowing just enough syscalls so that smartctl can query the disks.
security_opt = [
"no-new-privileges",
# Apparently there is no variable to determine the path to the allocation, hence this hack
"seccomp=/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}/local/smartctl-seccomp.json",
]
readonly_rootfs = true
# Sadly, devices must exist for Docker to accept this option, otherwise it fails to run.
# This is why we create "fake" devices in the init task above.
devices = [
{
host_path = "/dev/sda"
container_path = "/dev/sda"
cgroup_permissions = "r"
},
{
host_path = "/dev/sdb"
container_path = "/dev/sdb"
cgroup_permissions = "r"
},
{
host_path = "/dev/nvme0"
container_path = "/dev/nvme0"
cgroup_permissions = "r"
}
]
}
template {
data = file("../config/smartctl-seccomp.json")
destination = "local/smartctl-seccomp.json"
perms = 444
}
resources {
cpu = 50
memory = 40
}
service {
tags = [ "telemetry" ]
port = "smartctl_exporter"
name = "smartctl-exporter"
check {
type = "http"
path = "/"
port = 9101
address_mode = "driver"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
}
}
}

View file

@ -1,5 +1,5 @@
job "telemetry-storage" {
datacenters = ["scorpio", "bespin"]
datacenters = ["scorpio", "bespin", "corrin"]
type = "service"
group "prometheus" {
@ -14,7 +14,7 @@ job "telemetry-storage" {
constraint {
attribute = "${attr.unique.hostname}"
operator = "set_contains_any"
value = "ananas,df-ymk"
value = "pamplemousse,df-ymk"
}
task "prometheus" {
@ -26,7 +26,7 @@ job "telemetry-storage" {
args = [
"--config.file=/etc/prometheus/prometheus.yml",
"--storage.tsdb.path=/data",
"--storage.tsdb.retention.size=20GB",
"--storage.tsdb.retention.size=40GB",
]
volumes = [
"secrets:/etc/prometheus",
@ -72,7 +72,7 @@ job "telemetry-storage" {
resources {
memory = 1500
memory_max = 4000
cpu = 1000
cpu = 400
}
service {

View file

@ -1,49 +0,0 @@
job "telemetry-system" {
datacenters = ["neptune", "scorpio", "bespin", "corrin", "dathomir"]
type = "system"
priority = "100"
group "collector" {
network {
port "node_exporter" { static = 9100 }
}
task "node_exporter" {
driver = "docker"
config {
image = "quay.io/prometheus/node-exporter:v1.8.1"
network_mode = "host"
volumes = [
"/:/host:ro,rslave"
]
args = [ "--path.rootfs=/host" ]
}
resources {
cpu = 50
memory = 40
}
service {
tags = [ "telemetry" ]
port = 9100
address_mode = "driver"
name = "node-exporter"
check {
type = "http"
path = "/"
port = 9100
address_mode = "driver"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
}
}
}

View file

@ -6,6 +6,8 @@
# The IP range to use for the Wireguard overlay of this cluster
deuxfleurs.clusterPrefix = "10.83.0.0/16";
programs.vim.defaultEditor = true;
deuxfleurs.clusterNodes = {
"df-ykl" = {
siteName = "bespin";

View file

@ -0,0 +1,137 @@
job "git" {
datacenters = ["bespin"]
type = "service"
group "forgejo" {
count = 1
network {
port "http" { static = 3000 }
port "ssh" { static = 22 }
}
ephemeral_disk {
size = 10000
}
restart {
attempts = 10
delay = "30s"
}
task "forgejo" {
driver = "docker"
config {
image = "codeberg.org/forgejo/forgejo:10.0.2"
network_mode = "host"
readonly_rootfs = true
ports = [ "http", "ssh" ]
volumes = [
"/ssd/forgejo:/data",
"/etc/timezone:/etc/timezone:ro",
"/etc/localtime:/etc/localtime:ro"
]
}
template {
data = <<ENV
USER_UID = 106
USER_GID = 112
DB_TYPE = postgres
DB_HOST = db:3306
GITEA__database__NAME = gitea
GITEA__database__USER = gitea
GITEA__database__PASSWD = "{{ key \"secrets/git/forgejo/database_password\" }}"
# Mailer credentials
GITEA__mailer__USER = _system._gitea@deuxfleurs.fr
GITEA__mailer__PASSWD = "{{ key \"secrets/git/forgejo/mailer_password\" }}"
# General configuration
GITEA__server__DOMAIN = git.staging.deuxfleurs.org
GITEA__server__SSH_DOMAIN = git.staging.deuxfleurs.org
GITEA__server__ROOT_URL = https://git.staging.deuxfleurs.org
GITEA__server__LFS_START_SERVER = true
GITEA__database__DB_TYPE = mysql
GITEA__database__HOST = db
GITEA__mailer__ENABLED = true
GITEA__mailer__SMTP_ADDR = smtp.deuxfleurs.fr
GITEA__mailer__SMTP_PORT = 465
GITEA__mailer__PROTOCOL = smtps
GITEA__mailer__FROM = Deuxfleurs Gitea <_system._forjego@staging.deuxfleurs.org>
GITEA__mailer__FORCE_TRUST_SERVER_CERT = true
GITEA__mailer__SUBJECT_PREFIX = [Deuxfleurs Forgejo Staging]
GITEA__service__REGISTER_EMAIL_CONFIRM = true
GITEA__service__ENABLE_NOTIFY_MAIL = true
GITEA__admin__DEFAULT_EMAIL_NOTIFICATIONS = enabled
GITEA__lfs__PATH = /data/git/lfs
# Prevent spam accounts
GITEA__service__DEFAULT_ALLOW_CREATE_ORGANIZATION = false
GITEA__service__DEFAULT_USER_VISIBILITY = limited
GITEA__service__DEFAULT_KEEP_EMAIL_PRIVATE = true
# Allow CORS for StaticCMS (a fork of Netlify CMS)
GITEA__cors__ENABLED = true
GITEA__cors__ALLOW_DOMAIN = *
GITEA__cors__ALLOW_CREDENTIALS = true
GITEA__cors__METHODS = GET,HEAD,POST,PUT,PATCH,DELETE,OPTIONS
GITEA__cors__SCHEME = *
GITEA__cors__HEADERS = Content-Type,User-Agent,Authorization
# Options passed to Gitea
# see https://docs.gitea.io/en-us/config-cheat-sheet/
# config is in /data/gitea/config/app.ini
GITEA__ui__ISSUE_PAGING_NUM = 20
ENV
destination = "secrets/env"
env = true
}
resources {
cpu = 1000
memory = 1000
memory_max = 1000
}
service {
name = "forgejo-ssh"
port = "ssh"
address_mode = "host"
tags = [
"forgejo-staging-ssh",
"tricot git.staging.deuxfleurs.org 100",
"d53-cname git.staging.deuxfleurs.org",
]
check {
type = "tcp"
port = "ssh"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
service {
name = "forgejo-http"
port = "http"
address_mode = "host"
tags = [
"forgejo-staging-http",
"tricot-add-header Access-Control-Allow-Origin *",
"d53-cname git.staging.deuxfleurs.org",
]
check {
type = "tcp"
port = "http"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
}
}
}

View file

@ -12,9 +12,20 @@ scrape_configs:
services:
- 'node-exporter'
tls_config:
ca_file: /etc/prom/consul.crt
cert_file: /etc/prom/consul-client.crt
key_file: /etc/prom/consul-client.key
ca_file: /etc/prometheus/consul.crt
cert_file: /etc/prometheus/consul-client.crt
key_file: /etc/prometheus/consul-client.key
- job_name: 'smartctl-exporter'
scrape_interval: 300s
consul_sd_configs:
- server: 'https://localhost:8501'
services:
- 'smartctl-exporter'
tls_config:
ca_file: /etc/prometheus/consul.crt
cert_file: /etc/prometheus/consul-client.crt
key_file: /etc/prometheus/consul-client.key
- job_name: 'garage'
authorization:
@ -25,9 +36,9 @@ scrape_configs:
services:
- 'garage-staging-admin'
tls_config:
ca_file: /etc/prom/consul.crt
cert_file: /etc/prom/consul-client.crt
key_file: /etc/prom/consul-client.key
ca_file: /etc/prometheus/consul.crt
cert_file: /etc/prometheus/consul-client.crt
key_file: /etc/prometheus/consul-client.key
- job_name: 'tricot'
consul_sd_configs:
@ -35,9 +46,9 @@ scrape_configs:
services:
- 'tricot-metrics'
tls_config:
ca_file: /etc/prom/consul.crt
cert_file: /etc/prom/consul-client.crt
key_file: /etc/prom/consul-client.key
ca_file: /etc/prometheus/consul.crt
cert_file: /etc/prometheus/consul-client.crt
key_file: /etc/prometheus/consul-client.key
# see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config
# and https://www.nomadproject.io/api-docs/metrics
@ -50,15 +61,15 @@ scrape_configs:
format: ['prometheus']
scheme: 'https'
tls_config:
ca_file: /etc/prom/nomad-ca.crt
cert_file: /etc/prom/nomad-client.crt
key_file: /etc/prom/nomad-client.key
ca_file: /etc/prometheus/nomad-ca.crt
cert_file: /etc/prometheus/nomad-client.crt
key_file: /etc/prometheus/nomad-client.key
insecure_skip_verify: true
consul_sd_configs:
- server: 'https://localhost:8501'
services:
- 'nomad-client'
tls_config:
ca_file: /etc/prom/consul.crt
cert_file: /etc/prom/consul-client.crt
key_file: /etc/prom/consul-client.key
ca_file: /etc/prometheus/consul.crt
cert_file: /etc/prometheus/consul-client.crt
key_file: /etc/prometheus/consul-client.key

View file

@ -0,0 +1,170 @@
{
"defaultAction": "SCMP_ACT_ERRNO",
"defaultErrnoRet": 1,
"architectures": [
"SCMP_ARCH_X86_64"
],
"syscalls": [
{
"names": [
"rt_sigaction",
"rt_sigprocmask",
"getpid",
"fcntl",
"fstatfs",
"gettid",
"futex",
"getdents64",
"epoll_ctl",
"tgkill",
"openat",
"read",
"close",
"nanosleep",
"getsockname",
"setsockopt",
"chdir",
"capget",
"prctl",
"accept4",
"fstat",
"getcwd",
"setuid",
"setgid",
"setgroups",
"capset",
"newfstatat",
"write",
"writev",
"mmap",
"brk",
"rt_sigreturn",
"access",
"execve",
"getppid",
"exit_group",
"faccessat2",
"mprotect",
"pread64",
"arch_prctl",
"set_tid_address",
"set_robust_list",
"rseq",
"munmap",
"madvise",
"sigaltstack",
"statfs",
"waitid",
"readlinkat",
"eventfd2",
"epoll_create1",
"pipe2",
"pidfd_send_signal",
"pidfd_open",
"readlink",
"epoll_pwait",
"dup3",
"bind",
"listen",
"getrlimit",
"sched_getaffinity",
"sched_yield"
],
"action": "SCMP_ACT_ALLOW",
"comment": "globally needed by the go runtime"
},
{
"names": [
"open",
"uname"
],
"action": "SCMP_ACT_ALLOW",
"comment": "Used by smartctl"
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow SG_IO (aka SCSI commands) on ioctl as it's what's used to read SMART data",
"args": [
{
"index": 1,
"value": 8837,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow NVME_IOCTL_ID command (0x4e40) on ioctl as it's what's used to read data on NVMe devices",
"args": [
{
"index": 1,
"value": 20032,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow NVME_IOCTL_ADMIN_CMD command (0xc0484e41) on ioctl as it's what's used to read data on NVMe devices. For some reason, it needs to be encoded as 0xffffffffc0484e41",
"args": [
{
"index": 1,
"value": 18446744072640548417,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"ioctl"
],
"action": "SCMP_ACT_ERRNO",
"comment": "Debug to allow/deny all ioctl (change to _LOG, _ALLOW, or _ERRNO appropriately)"
},
{
"names": [
"clone"
],
"action": "SCMP_ACT_ALLOW",
"comment": "partially allow clone as per docker config",
"args": [
{
"index": 0,
"value": 2114060288,
"op": "SCMP_CMP_MASKED_EQ"
}
]
},
{
"names": [
"clone3"
],
"action": "SCMP_ACT_ERRNO",
"comment": "disable clone3 in a specific way as per docker's default config",
"errnoRet": 38
},
{
"names": [
"socket"
],
"action": "SCMP_ACT_ALLOW",
"comment": "allow IPv4 sockets",
"args": [
{
"index": 0,
"value": 2,
"op": "SCMP_CMP_EQ"
}
]
}
]
}

View file

@ -0,0 +1,51 @@
job "telemetry-node-exporter" {
datacenters = ["neptune", "dathomir", "corrin", "bespin"]
type = "system"
priority = "100"
group "node_exporter" {
network {
port "node_exporter" { static = 9100 }
}
task "node_exporter" {
driver = "docker"
config {
image = "quay.io/prometheus/node-exporter:v1.8.1"
ports = ["node_exporter"]
volumes = [
"/:/host:ro,rslave"
]
args = [
"--web.listen-address=0.0.0.0:${NOMAD_PORT_node_exporter}",
"--path.rootfs=/host"
]
}
resources {
cpu = 50
memory = 40
}
service {
tags = [ "telemetry" ]
port = "node_exporter"
name = "node-exporter"
check {
type = "http"
path = "/"
port = 9100
address_mode = "driver"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
}
}
}

View file

@ -0,0 +1,131 @@
job "telemetry-smartctl-exporter" {
datacenters = ["neptune", "dathomir", "corrin", "bespin"]
type = "system"
priority = "100"
group "smartctl_exporter" {
network {
port "smartctl_exporter" { static = 9101 }
}
# This init task creates "fake" disk devices. This way, we can
# restrict which devices we expose to smartctl_exporter while having
# the same task configuration on all hosts.
task "create_fake_disks" {
driver = "docker"
user = "root"
config {
image = "bash:5.2.37"
args = [
"-x", "${NOMAD_TASK_DIR}/create_fake_disks.sh"
]
readonly_rootfs = true
mount {
type = "bind"
target = "/dev"
source = "/dev"
readonly = false
}
}
template {
data = <<EOF
echo "Checking existing disks and creating fake devices if needed..."
[ -a "/dev/sda" ] || ln -s /dev/null /dev/sda
[ -a "/dev/sdb" ] || ln -s /dev/null /dev/sdb
[ -a "/dev/nvme0" ] || ln -s /dev/null /dev/nvme0
EOF
destination = "local/create_fake_disks.sh"
perms = 755
}
resources {
cpu = 10
memory = 10
}
lifecycle {
hook = "prestart"
sidecar = false
}
}
task "smartctl_exporter" {
driver = "docker"
# Necessary to use low-level SMART and NVMe commands
user = "root"
config {
image = "prometheuscommunity/smartctl-exporter:v0.13.0"
args = [
"--web.listen-address=0.0.0.0:${NOMAD_PORT_smartctl_exporter}"
]
ports = ["smartctl_exporter"]
# CAP_SYS_RAWIO is needed for SMART requests, while CAP_SYS_ADMIN
# is needed for NVMe requests.
# These capabilities need to be allowed in the Nomad client config.
cap_drop = ["all"]
cap_add = ["CAP_SYS_RAWIO", "CAP_SYS_ADMIN"]
# Hardening options to avoid running the container as privileged,
# while still allowing just enough syscalls so that smartctl can query the disks.
security_opt = [
"no-new-privileges",
# Apparently there is no variable to determine the path to the allocation, hence this hack
"seccomp=/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}/local/smartctl-seccomp.json",
]
readonly_rootfs = true
# Sadly, devices must exist for Docker to accept this option, otherwise it fails to run.
# This is why we create "fake" devices in the init task above.
devices = [
{
host_path = "/dev/sda"
container_path = "/dev/sda"
cgroup_permissions = "r"
},
{
host_path = "/dev/sdb"
container_path = "/dev/sdb"
cgroup_permissions = "r"
},
{
host_path = "/dev/nvme0"
container_path = "/dev/nvme0"
cgroup_permissions = "r"
}
]
}
template {
data = file("../config/smartctl-seccomp.json")
destination = "local/smartctl-seccomp.json"
perms = 444
}
resources {
cpu = 50
memory = 40
}
service {
tags = [ "telemetry" ]
port = "smartctl_exporter"
name = "smartctl-exporter"
check {
type = "http"
path = "/"
port = 9101
address_mode = "driver"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
}
}
}

View file

@ -1,49 +0,0 @@
job "telemetry-system" {
datacenters = ["neptune", "dathomir", "corrin", "bespin"]
type = "system"
priority = "100"
group "collector" {
network {
port "node_exporter" { static = 9100 }
}
task "node_exporter" {
driver = "docker"
config {
image = "quay.io/prometheus/node-exporter:v1.8.1"
network_mode = "host"
volumes = [
"/:/host:ro,rslave"
]
args = [ "--path.rootfs=/host" ]
}
resources {
cpu = 50
memory = 40
}
service {
tags = [ "telemetry" ]
port = 9100
address_mode = "driver"
name = "node-exporter"
check {
type = "http"
path = "/"
port = 9100
address_mode = "driver"
interval = "60s"
timeout = "5s"
check_restart {
limit = 3
grace = "90s"
ignore_warnings = false
}
}
}
}
}
}

View file

@ -36,10 +36,15 @@
deuxfleurs.wgautomeshPort = 1667;
deuxfleurs.services.wgautomesh.logLevel = "debug";
programs.vim = {
enable = true;
defaultEditor = true;
};
# Bootstrap IPs for Consul cluster,
# these are IPs on the Wireguard overlay
services.consul.extraConfig.retry_join = [
"10.14.1.3" # caribou
"10.14.4.1" # df-pw5
"10.14.2.33" # origan
"10.14.3.1" # piranha
];

View file

@ -10,7 +10,6 @@
deuxfleurs.hostName = "caribou";
deuxfleurs.staticIPv6.address = "2a01:e34:ec05:8a40::23";
deuxfleurs.isRaftServer = true;
# this denotes the version at install time, do not update
system.stateVersion = "21.05";

View file

@ -11,6 +11,7 @@
deuxfleurs.hostName = "piranha";
deuxfleurs.staticIPv4.address = "192.168.5.25";
deuxfleurs.staticIPv6.address = "2001:912:1ac0:2200::25";
deuxfleurs.isRaftServer = true;
# this denotes the version at install time, do not update
system.stateVersion = "24.05";

View file

@ -26,6 +26,11 @@ if [ "$CLUSTER" = "prod" ]; then
message "2. Reboot node manually. You can also take the opportunity to upgrade with:"
message " REBOOT_NODES=yes ./upgrade_nixos prod $NIXHOST"
message "3. Mark node as eligible again in Nomad"
message ""
message "If you are certain that the update is not disruptive, you can manually apply changes:"
message "1. Connect to node '$NIXHOST' over SSH"
message "2. Run this on the node:"
message " sudo nixos-rebuild switch"
message "-------------------------------------------------------------------------------------"
else
cmd nixos-rebuild switch

View file

@ -47,5 +47,6 @@ ports so that we can avoid conflicts when adding services.
8999 opendkim
9090 prometheus
9100 node_exporter
9101 smartctl_exporter
9334 tricot metrics
9991 guichet

View file

@ -68,11 +68,6 @@ SystemMaxUse=1G
# Enable support for all terminal emulators such as urxvt
environment.enableAllTerminfo = true;
programs.vim = {
enable = true;
defaultEditor = true;
};
# Enable network time
services.ntp.enable = false;
services.timesyncd.enable = true;

View file

@ -397,6 +397,7 @@ in
{
volumes.enabled = true;
allow_privileged = true;
allow_caps = ["all"];
}
];
}