Compare commits
21 commits
8750309014
...
8418c40250
Author | SHA1 | Date | |
---|---|---|---|
8418c40250 | |||
|
fe68fdf54a | ||
|
187d36eb9b | ||
|
fd6275f5bc | ||
|
fc88a063b1 | ||
|
bb8c9db2ed | ||
451068d716 | |||
|
797f946578 | ||
|
596b7ab966 | ||
|
ec1fa3e540 | ||
67230dd60c | |||
305c160899 | |||
|
8d9aa00de5 | ||
|
5790453ff1 | ||
|
a2a470ac3d | ||
|
2009572fea | ||
|
8f0a45f03e | ||
|
b98e72af96 | ||
|
e805cf5cf6 | ||
6b52ccd374 | |||
|
c5a0577cbf |
23 changed files with 909 additions and 136 deletions
|
@ -3,11 +3,6 @@ job "garage" {
|
|||
type = "system"
|
||||
priority = 80
|
||||
|
||||
update {
|
||||
max_parallel = 2
|
||||
min_healthy_time = "60s"
|
||||
}
|
||||
|
||||
group "garage" {
|
||||
network {
|
||||
port "s3" { static = 3900 }
|
||||
|
@ -18,7 +13,11 @@ job "garage" {
|
|||
}
|
||||
|
||||
update {
|
||||
max_parallel = 10
|
||||
# When upgrading the service, stop and upgrade nodes one by one.
|
||||
# This should allow performing minor upgrades without downtime.
|
||||
# (A higher value for max_parallel would risk stopping nodes in different
|
||||
# zones at the same time, which would make the cluster unavailable)
|
||||
max_parallel = 1
|
||||
min_healthy_time = "30s"
|
||||
healthy_deadline = "5m"
|
||||
}
|
||||
|
@ -26,7 +25,7 @@ job "garage" {
|
|||
task "server" {
|
||||
driver = "docker"
|
||||
config {
|
||||
image = "superboum/garage:v1.0.0-rc1-hotfix-red-ftr-wquorum"
|
||||
image = "dxflrs/garage:v1.99.1-internal"
|
||||
command = "/garage"
|
||||
args = [ "server" ]
|
||||
network_mode = "host"
|
||||
|
|
|
@ -13,7 +13,7 @@ job "guichet" {
|
|||
task "guichet" {
|
||||
driver = "docker"
|
||||
config {
|
||||
image = "dxflrs/guichet:0x4y7bj1qb8w8hckvpbzlgyxh63j66ij"
|
||||
image = "dxflrs/guichet:aqd78kjs4dmizm8gax67d8sd68l9gcf5"
|
||||
args = [ "server", "-config", "/etc/config.json" ]
|
||||
readonly_rootfs = true
|
||||
ports = [ "web_port" ]
|
||||
|
|
|
@ -17,8 +17,8 @@ job "plume-blog" {
|
|||
network_mode = "host"
|
||||
ports = [ "cache_port" ]
|
||||
|
||||
# cache
|
||||
mount {
|
||||
# cache
|
||||
mount {
|
||||
type = "tmpfs"
|
||||
target = "/var/lib/varnish/varnishd:exec"
|
||||
readonly = false
|
||||
|
@ -64,8 +64,8 @@ job "plume-blog" {
|
|||
}
|
||||
|
||||
resources {
|
||||
memory = 1024
|
||||
memory_max = 1024
|
||||
memory = 1500
|
||||
memory_max = 1500
|
||||
cpu = 100
|
||||
}
|
||||
|
||||
|
|
|
@ -16,6 +16,17 @@ scrape_configs:
|
|||
cert_file: /etc/prometheus/consul-client.crt
|
||||
key_file: /etc/prometheus/consul-client.key
|
||||
|
||||
- job_name: 'smartctl-exporter'
|
||||
scrape_interval: 300s
|
||||
consul_sd_configs:
|
||||
- server: 'https://localhost:8501'
|
||||
services:
|
||||
- 'smartctl-exporter'
|
||||
tls_config:
|
||||
ca_file: /etc/prometheus/consul-ca.crt
|
||||
cert_file: /etc/prometheus/consul-client.crt
|
||||
key_file: /etc/prometheus/consul-client.key
|
||||
|
||||
- job_name: 'tricot'
|
||||
consul_sd_configs:
|
||||
- server: 'https://localhost:8501'
|
||||
|
|
170
cluster/prod/app/telemetry/config/smartctl-seccomp.json
Normal file
170
cluster/prod/app/telemetry/config/smartctl-seccomp.json
Normal file
|
@ -0,0 +1,170 @@
|
|||
{
|
||||
"defaultAction": "SCMP_ACT_ERRNO",
|
||||
"defaultErrnoRet": 1,
|
||||
"architectures": [
|
||||
"SCMP_ARCH_X86_64"
|
||||
],
|
||||
"syscalls": [
|
||||
{
|
||||
"names": [
|
||||
"rt_sigaction",
|
||||
"rt_sigprocmask",
|
||||
"getpid",
|
||||
"fcntl",
|
||||
"fstatfs",
|
||||
"gettid",
|
||||
"futex",
|
||||
"getdents64",
|
||||
"epoll_ctl",
|
||||
"tgkill",
|
||||
"openat",
|
||||
"read",
|
||||
"close",
|
||||
"nanosleep",
|
||||
"getsockname",
|
||||
"setsockopt",
|
||||
"chdir",
|
||||
"capget",
|
||||
"prctl",
|
||||
"accept4",
|
||||
"fstat",
|
||||
"getcwd",
|
||||
"setuid",
|
||||
"setgid",
|
||||
"setgroups",
|
||||
"capset",
|
||||
"newfstatat",
|
||||
"write",
|
||||
"writev",
|
||||
"mmap",
|
||||
"brk",
|
||||
"rt_sigreturn",
|
||||
"access",
|
||||
"execve",
|
||||
"getppid",
|
||||
"exit_group",
|
||||
"faccessat2",
|
||||
"mprotect",
|
||||
"pread64",
|
||||
"arch_prctl",
|
||||
"set_tid_address",
|
||||
"set_robust_list",
|
||||
"rseq",
|
||||
"munmap",
|
||||
"madvise",
|
||||
"sigaltstack",
|
||||
"statfs",
|
||||
"waitid",
|
||||
"readlinkat",
|
||||
"eventfd2",
|
||||
"epoll_create1",
|
||||
"pipe2",
|
||||
"pidfd_send_signal",
|
||||
"pidfd_open",
|
||||
"readlink",
|
||||
"epoll_pwait",
|
||||
"dup3",
|
||||
"bind",
|
||||
"listen",
|
||||
"getrlimit",
|
||||
"sched_getaffinity",
|
||||
"sched_yield"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "globally needed by the go runtime"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"open",
|
||||
"uname"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "Used by smartctl"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow SG_IO (aka SCSI commands) on ioctl as it's what's used to read SMART data",
|
||||
"args": [
|
||||
{
|
||||
"index": 1,
|
||||
"value": 8837,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow NVME_IOCTL_ID command (0x4e40) on ioctl as it's what's used to read data on NVMe devices",
|
||||
"args": [
|
||||
{
|
||||
"index": 1,
|
||||
"value": 20032,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow NVME_IOCTL_ADMIN_CMD command (0xc0484e41) on ioctl as it's what's used to read data on NVMe devices. For some reason, it needs to be encoded as 0xffffffffc0484e41",
|
||||
"args": [
|
||||
{
|
||||
"index": 1,
|
||||
"value": 18446744072640548417,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"comment": "Debug to allow/deny all ioctl (change to _LOG, _ALLOW, or _ERRNO appropriately)"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"clone"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "partially allow clone as per docker config",
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 2114060288,
|
||||
"op": "SCMP_CMP_MASKED_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"clone3"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"comment": "disable clone3 in a specific way as per docker's default config",
|
||||
"errnoRet": 38
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"socket"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow IPv4 sockets",
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 2,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
job "telemetry-node-exporter" {
|
||||
datacenters = ["neptune", "scorpio", "bespin", "corrin", "dathomir"]
|
||||
type = "system"
|
||||
priority = "100"
|
||||
|
||||
group "node_exporter" {
|
||||
network {
|
||||
port "node_exporter" { static = 9100 }
|
||||
}
|
||||
|
||||
task "node_exporter" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "quay.io/prometheus/node-exporter:v1.8.1"
|
||||
ports = ["node_exporter"]
|
||||
volumes = [
|
||||
"/:/host:ro,rslave"
|
||||
]
|
||||
args = [
|
||||
"--web.listen-address=0.0.0.0:${NOMAD_PORT_node_exporter}",
|
||||
"--path.rootfs=/host"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 40
|
||||
}
|
||||
|
||||
service {
|
||||
tags = [ "telemetry" ]
|
||||
port = "node_exporter"
|
||||
name = "node-exporter"
|
||||
check {
|
||||
type = "http"
|
||||
path = "/"
|
||||
port = 9100
|
||||
address_mode = "driver"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,131 @@
|
|||
job "telemetry-smartctl-exporter" {
|
||||
datacenters = ["neptune", "scorpio", "bespin", "corrin", "dathomir"]
|
||||
type = "system"
|
||||
priority = "100"
|
||||
|
||||
group "smartctl_exporter" {
|
||||
network {
|
||||
port "smartctl_exporter" { static = 9101 }
|
||||
}
|
||||
|
||||
# This init task creates "fake" disk devices. This way, we can
|
||||
# restrict which devices we expose to smartctl_exporter while having
|
||||
# the same task configuration on all hosts.
|
||||
task "create_fake_disks" {
|
||||
driver = "docker"
|
||||
user = "root"
|
||||
|
||||
config {
|
||||
image = "bash:5.2.37"
|
||||
args = [
|
||||
"-x", "${NOMAD_TASK_DIR}/create_fake_disks.sh"
|
||||
]
|
||||
readonly_rootfs = true
|
||||
|
||||
mount {
|
||||
type = "bind"
|
||||
target = "/dev"
|
||||
source = "/dev"
|
||||
readonly = false
|
||||
}
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<EOF
|
||||
echo "Checking existing disks and creating fake devices if needed..."
|
||||
[ -a "/dev/sda" ] || ln -s /dev/null /dev/sda
|
||||
[ -a "/dev/sdb" ] || ln -s /dev/null /dev/sdb
|
||||
[ -a "/dev/nvme0" ] || ln -s /dev/null /dev/nvme0
|
||||
EOF
|
||||
destination = "local/create_fake_disks.sh"
|
||||
perms = 755
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 10
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
hook = "prestart"
|
||||
sidecar = false
|
||||
}
|
||||
}
|
||||
|
||||
task "smartctl_exporter" {
|
||||
driver = "docker"
|
||||
# Necessary to use low-level SMART and NVMe commands
|
||||
user = "root"
|
||||
|
||||
config {
|
||||
image = "prometheuscommunity/smartctl-exporter:v0.13.0"
|
||||
args = [
|
||||
"--web.listen-address=0.0.0.0:${NOMAD_PORT_smartctl_exporter}"
|
||||
]
|
||||
ports = ["smartctl_exporter"]
|
||||
# CAP_SYS_RAWIO is needed for SMART requests, while CAP_SYS_ADMIN
|
||||
# is needed for NVMe requests.
|
||||
# These capabilities need to be allowed in the Nomad client config.
|
||||
cap_drop = ["all"]
|
||||
cap_add = ["CAP_SYS_RAWIO", "CAP_SYS_ADMIN"]
|
||||
# Hardening options to avoid running the container as privileged,
|
||||
# while still allowing just enough syscalls so that smartctl can query the disks.
|
||||
security_opt = [
|
||||
"no-new-privileges",
|
||||
# Apparently there is no variable to determine the path to the allocation, hence this hack
|
||||
"seccomp=/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}/local/smartctl-seccomp.json",
|
||||
]
|
||||
readonly_rootfs = true
|
||||
# Sadly, devices must exist for Docker to accept this option, otherwise it fails to run.
|
||||
# This is why we create "fake" devices in the init task above.
|
||||
devices = [
|
||||
{
|
||||
host_path = "/dev/sda"
|
||||
container_path = "/dev/sda"
|
||||
cgroup_permissions = "r"
|
||||
},
|
||||
{
|
||||
host_path = "/dev/sdb"
|
||||
container_path = "/dev/sdb"
|
||||
cgroup_permissions = "r"
|
||||
},
|
||||
{
|
||||
host_path = "/dev/nvme0"
|
||||
container_path = "/dev/nvme0"
|
||||
cgroup_permissions = "r"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
template {
|
||||
data = file("../config/smartctl-seccomp.json")
|
||||
destination = "local/smartctl-seccomp.json"
|
||||
perms = 444
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 40
|
||||
}
|
||||
|
||||
service {
|
||||
tags = [ "telemetry" ]
|
||||
port = "smartctl_exporter"
|
||||
name = "smartctl-exporter"
|
||||
check {
|
||||
type = "http"
|
||||
path = "/"
|
||||
port = 9101
|
||||
address_mode = "driver"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
job "telemetry-storage" {
|
||||
datacenters = ["scorpio", "bespin"]
|
||||
datacenters = ["scorpio", "bespin", "corrin"]
|
||||
type = "service"
|
||||
|
||||
group "prometheus" {
|
||||
|
@ -14,7 +14,7 @@ job "telemetry-storage" {
|
|||
constraint {
|
||||
attribute = "${attr.unique.hostname}"
|
||||
operator = "set_contains_any"
|
||||
value = "ananas,df-ymk"
|
||||
value = "pamplemousse,df-ymk"
|
||||
}
|
||||
|
||||
task "prometheus" {
|
||||
|
@ -26,7 +26,7 @@ job "telemetry-storage" {
|
|||
args = [
|
||||
"--config.file=/etc/prometheus/prometheus.yml",
|
||||
"--storage.tsdb.path=/data",
|
||||
"--storage.tsdb.retention.size=20GB",
|
||||
"--storage.tsdb.retention.size=40GB",
|
||||
]
|
||||
volumes = [
|
||||
"secrets:/etc/prometheus",
|
||||
|
@ -72,7 +72,7 @@ job "telemetry-storage" {
|
|||
resources {
|
||||
memory = 1500
|
||||
memory_max = 4000
|
||||
cpu = 1000
|
||||
cpu = 400
|
||||
}
|
||||
|
||||
service {
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
job "telemetry-system" {
|
||||
datacenters = ["neptune", "scorpio", "bespin", "corrin", "dathomir"]
|
||||
type = "system"
|
||||
priority = "100"
|
||||
|
||||
group "collector" {
|
||||
network {
|
||||
port "node_exporter" { static = 9100 }
|
||||
}
|
||||
|
||||
task "node_exporter" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "quay.io/prometheus/node-exporter:v1.8.1"
|
||||
network_mode = "host"
|
||||
volumes = [
|
||||
"/:/host:ro,rslave"
|
||||
]
|
||||
args = [ "--path.rootfs=/host" ]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 40
|
||||
}
|
||||
|
||||
service {
|
||||
tags = [ "telemetry" ]
|
||||
port = 9100
|
||||
address_mode = "driver"
|
||||
name = "node-exporter"
|
||||
check {
|
||||
type = "http"
|
||||
path = "/"
|
||||
port = 9100
|
||||
address_mode = "driver"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -6,6 +6,8 @@
|
|||
# The IP range to use for the Wireguard overlay of this cluster
|
||||
deuxfleurs.clusterPrefix = "10.83.0.0/16";
|
||||
|
||||
programs.vim.defaultEditor = true;
|
||||
|
||||
deuxfleurs.clusterNodes = {
|
||||
"df-ykl" = {
|
||||
siteName = "bespin";
|
||||
|
|
137
cluster/staging/app/git/deploy/forgejo.hcl
Normal file
137
cluster/staging/app/git/deploy/forgejo.hcl
Normal file
|
@ -0,0 +1,137 @@
|
|||
job "git" {
|
||||
datacenters = ["bespin"]
|
||||
type = "service"
|
||||
|
||||
group "forgejo" {
|
||||
count = 1
|
||||
|
||||
network {
|
||||
port "http" { static = 3000 }
|
||||
port "ssh" { static = 22 }
|
||||
}
|
||||
|
||||
ephemeral_disk {
|
||||
size = 10000
|
||||
}
|
||||
|
||||
restart {
|
||||
attempts = 10
|
||||
delay = "30s"
|
||||
}
|
||||
|
||||
task "forgejo" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "codeberg.org/forgejo/forgejo:10.0.2"
|
||||
network_mode = "host"
|
||||
readonly_rootfs = true
|
||||
ports = [ "http", "ssh" ]
|
||||
volumes = [
|
||||
"/ssd/forgejo:/data",
|
||||
"/etc/timezone:/etc/timezone:ro",
|
||||
"/etc/localtime:/etc/localtime:ro"
|
||||
]
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<ENV
|
||||
USER_UID = 106
|
||||
USER_GID = 112
|
||||
DB_TYPE = postgres
|
||||
DB_HOST = db:3306
|
||||
GITEA__database__NAME = gitea
|
||||
GITEA__database__USER = gitea
|
||||
GITEA__database__PASSWD = "{{ key \"secrets/git/forgejo/database_password\" }}"
|
||||
# Mailer credentials
|
||||
GITEA__mailer__USER = _system._gitea@deuxfleurs.fr
|
||||
GITEA__mailer__PASSWD = "{{ key \"secrets/git/forgejo/mailer_password\" }}"
|
||||
# General configuration
|
||||
GITEA__server__DOMAIN = git.staging.deuxfleurs.org
|
||||
GITEA__server__SSH_DOMAIN = git.staging.deuxfleurs.org
|
||||
GITEA__server__ROOT_URL = https://git.staging.deuxfleurs.org
|
||||
GITEA__server__LFS_START_SERVER = true
|
||||
GITEA__database__DB_TYPE = mysql
|
||||
GITEA__database__HOST = db
|
||||
GITEA__mailer__ENABLED = true
|
||||
GITEA__mailer__SMTP_ADDR = smtp.deuxfleurs.fr
|
||||
GITEA__mailer__SMTP_PORT = 465
|
||||
GITEA__mailer__PROTOCOL = smtps
|
||||
GITEA__mailer__FROM = Deuxfleurs Gitea <_system._forjego@staging.deuxfleurs.org>
|
||||
GITEA__mailer__FORCE_TRUST_SERVER_CERT = true
|
||||
GITEA__mailer__SUBJECT_PREFIX = [Deuxfleurs Forgejo Staging]
|
||||
GITEA__service__REGISTER_EMAIL_CONFIRM = true
|
||||
GITEA__service__ENABLE_NOTIFY_MAIL = true
|
||||
GITEA__admin__DEFAULT_EMAIL_NOTIFICATIONS = enabled
|
||||
GITEA__lfs__PATH = /data/git/lfs
|
||||
# Prevent spam accounts
|
||||
GITEA__service__DEFAULT_ALLOW_CREATE_ORGANIZATION = false
|
||||
GITEA__service__DEFAULT_USER_VISIBILITY = limited
|
||||
GITEA__service__DEFAULT_KEEP_EMAIL_PRIVATE = true
|
||||
# Allow CORS for StaticCMS (a fork of Netlify CMS)
|
||||
GITEA__cors__ENABLED = true
|
||||
GITEA__cors__ALLOW_DOMAIN = *
|
||||
GITEA__cors__ALLOW_CREDENTIALS = true
|
||||
GITEA__cors__METHODS = GET,HEAD,POST,PUT,PATCH,DELETE,OPTIONS
|
||||
GITEA__cors__SCHEME = *
|
||||
GITEA__cors__HEADERS = Content-Type,User-Agent,Authorization
|
||||
# Options passed to Gitea
|
||||
# see https://docs.gitea.io/en-us/config-cheat-sheet/
|
||||
# config is in /data/gitea/config/app.ini
|
||||
GITEA__ui__ISSUE_PAGING_NUM = 20
|
||||
ENV
|
||||
destination = "secrets/env"
|
||||
env = true
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 1000
|
||||
memory = 1000
|
||||
memory_max = 1000
|
||||
}
|
||||
|
||||
service {
|
||||
name = "forgejo-ssh"
|
||||
port = "ssh"
|
||||
address_mode = "host"
|
||||
tags = [
|
||||
"forgejo-staging-ssh",
|
||||
"tricot git.staging.deuxfleurs.org 100",
|
||||
"d53-cname git.staging.deuxfleurs.org",
|
||||
]
|
||||
check {
|
||||
type = "tcp"
|
||||
port = "ssh"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
service {
|
||||
name = "forgejo-http"
|
||||
port = "http"
|
||||
address_mode = "host"
|
||||
tags = [
|
||||
"forgejo-staging-http",
|
||||
"tricot-add-header Access-Control-Allow-Origin *",
|
||||
"d53-cname git.staging.deuxfleurs.org",
|
||||
]
|
||||
check {
|
||||
type = "tcp"
|
||||
port = "http"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -12,9 +12,20 @@ scrape_configs:
|
|||
services:
|
||||
- 'node-exporter'
|
||||
tls_config:
|
||||
ca_file: /etc/prom/consul.crt
|
||||
cert_file: /etc/prom/consul-client.crt
|
||||
key_file: /etc/prom/consul-client.key
|
||||
ca_file: /etc/prometheus/consul.crt
|
||||
cert_file: /etc/prometheus/consul-client.crt
|
||||
key_file: /etc/prometheus/consul-client.key
|
||||
|
||||
- job_name: 'smartctl-exporter'
|
||||
scrape_interval: 300s
|
||||
consul_sd_configs:
|
||||
- server: 'https://localhost:8501'
|
||||
services:
|
||||
- 'smartctl-exporter'
|
||||
tls_config:
|
||||
ca_file: /etc/prometheus/consul.crt
|
||||
cert_file: /etc/prometheus/consul-client.crt
|
||||
key_file: /etc/prometheus/consul-client.key
|
||||
|
||||
- job_name: 'garage'
|
||||
authorization:
|
||||
|
@ -25,9 +36,9 @@ scrape_configs:
|
|||
services:
|
||||
- 'garage-staging-admin'
|
||||
tls_config:
|
||||
ca_file: /etc/prom/consul.crt
|
||||
cert_file: /etc/prom/consul-client.crt
|
||||
key_file: /etc/prom/consul-client.key
|
||||
ca_file: /etc/prometheus/consul.crt
|
||||
cert_file: /etc/prometheus/consul-client.crt
|
||||
key_file: /etc/prometheus/consul-client.key
|
||||
|
||||
- job_name: 'tricot'
|
||||
consul_sd_configs:
|
||||
|
@ -35,9 +46,9 @@ scrape_configs:
|
|||
services:
|
||||
- 'tricot-metrics'
|
||||
tls_config:
|
||||
ca_file: /etc/prom/consul.crt
|
||||
cert_file: /etc/prom/consul-client.crt
|
||||
key_file: /etc/prom/consul-client.key
|
||||
ca_file: /etc/prometheus/consul.crt
|
||||
cert_file: /etc/prometheus/consul-client.crt
|
||||
key_file: /etc/prometheus/consul-client.key
|
||||
|
||||
# see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config
|
||||
# and https://www.nomadproject.io/api-docs/metrics
|
||||
|
@ -50,15 +61,15 @@ scrape_configs:
|
|||
format: ['prometheus']
|
||||
scheme: 'https'
|
||||
tls_config:
|
||||
ca_file: /etc/prom/nomad-ca.crt
|
||||
cert_file: /etc/prom/nomad-client.crt
|
||||
key_file: /etc/prom/nomad-client.key
|
||||
ca_file: /etc/prometheus/nomad-ca.crt
|
||||
cert_file: /etc/prometheus/nomad-client.crt
|
||||
key_file: /etc/prometheus/nomad-client.key
|
||||
insecure_skip_verify: true
|
||||
consul_sd_configs:
|
||||
- server: 'https://localhost:8501'
|
||||
services:
|
||||
- 'nomad-client'
|
||||
tls_config:
|
||||
ca_file: /etc/prom/consul.crt
|
||||
cert_file: /etc/prom/consul-client.crt
|
||||
key_file: /etc/prom/consul-client.key
|
||||
ca_file: /etc/prometheus/consul.crt
|
||||
cert_file: /etc/prometheus/consul-client.crt
|
||||
key_file: /etc/prometheus/consul-client.key
|
||||
|
|
170
cluster/staging/app/telemetry/config/smartctl-seccomp.json
Normal file
170
cluster/staging/app/telemetry/config/smartctl-seccomp.json
Normal file
|
@ -0,0 +1,170 @@
|
|||
{
|
||||
"defaultAction": "SCMP_ACT_ERRNO",
|
||||
"defaultErrnoRet": 1,
|
||||
"architectures": [
|
||||
"SCMP_ARCH_X86_64"
|
||||
],
|
||||
"syscalls": [
|
||||
{
|
||||
"names": [
|
||||
"rt_sigaction",
|
||||
"rt_sigprocmask",
|
||||
"getpid",
|
||||
"fcntl",
|
||||
"fstatfs",
|
||||
"gettid",
|
||||
"futex",
|
||||
"getdents64",
|
||||
"epoll_ctl",
|
||||
"tgkill",
|
||||
"openat",
|
||||
"read",
|
||||
"close",
|
||||
"nanosleep",
|
||||
"getsockname",
|
||||
"setsockopt",
|
||||
"chdir",
|
||||
"capget",
|
||||
"prctl",
|
||||
"accept4",
|
||||
"fstat",
|
||||
"getcwd",
|
||||
"setuid",
|
||||
"setgid",
|
||||
"setgroups",
|
||||
"capset",
|
||||
"newfstatat",
|
||||
"write",
|
||||
"writev",
|
||||
"mmap",
|
||||
"brk",
|
||||
"rt_sigreturn",
|
||||
"access",
|
||||
"execve",
|
||||
"getppid",
|
||||
"exit_group",
|
||||
"faccessat2",
|
||||
"mprotect",
|
||||
"pread64",
|
||||
"arch_prctl",
|
||||
"set_tid_address",
|
||||
"set_robust_list",
|
||||
"rseq",
|
||||
"munmap",
|
||||
"madvise",
|
||||
"sigaltstack",
|
||||
"statfs",
|
||||
"waitid",
|
||||
"readlinkat",
|
||||
"eventfd2",
|
||||
"epoll_create1",
|
||||
"pipe2",
|
||||
"pidfd_send_signal",
|
||||
"pidfd_open",
|
||||
"readlink",
|
||||
"epoll_pwait",
|
||||
"dup3",
|
||||
"bind",
|
||||
"listen",
|
||||
"getrlimit",
|
||||
"sched_getaffinity",
|
||||
"sched_yield"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "globally needed by the go runtime"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"open",
|
||||
"uname"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "Used by smartctl"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow SG_IO (aka SCSI commands) on ioctl as it's what's used to read SMART data",
|
||||
"args": [
|
||||
{
|
||||
"index": 1,
|
||||
"value": 8837,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow NVME_IOCTL_ID command (0x4e40) on ioctl as it's what's used to read data on NVMe devices",
|
||||
"args": [
|
||||
{
|
||||
"index": 1,
|
||||
"value": 20032,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow NVME_IOCTL_ADMIN_CMD command (0xc0484e41) on ioctl as it's what's used to read data on NVMe devices. For some reason, it needs to be encoded as 0xffffffffc0484e41",
|
||||
"args": [
|
||||
{
|
||||
"index": 1,
|
||||
"value": 18446744072640548417,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"ioctl"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"comment": "Debug to allow/deny all ioctl (change to _LOG, _ALLOW, or _ERRNO appropriately)"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"clone"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "partially allow clone as per docker config",
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 2114060288,
|
||||
"op": "SCMP_CMP_MASKED_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"clone3"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"comment": "disable clone3 in a specific way as per docker's default config",
|
||||
"errnoRet": 38
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"socket"
|
||||
],
|
||||
"action": "SCMP_ACT_ALLOW",
|
||||
"comment": "allow IPv4 sockets",
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 2,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
job "telemetry-node-exporter" {
|
||||
datacenters = ["neptune", "dathomir", "corrin", "bespin"]
|
||||
type = "system"
|
||||
priority = "100"
|
||||
|
||||
group "node_exporter" {
|
||||
network {
|
||||
port "node_exporter" { static = 9100 }
|
||||
}
|
||||
|
||||
task "node_exporter" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "quay.io/prometheus/node-exporter:v1.8.1"
|
||||
ports = ["node_exporter"]
|
||||
volumes = [
|
||||
"/:/host:ro,rslave"
|
||||
]
|
||||
args = [
|
||||
"--web.listen-address=0.0.0.0:${NOMAD_PORT_node_exporter}",
|
||||
"--path.rootfs=/host"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 40
|
||||
}
|
||||
|
||||
service {
|
||||
tags = [ "telemetry" ]
|
||||
port = "node_exporter"
|
||||
name = "node-exporter"
|
||||
check {
|
||||
type = "http"
|
||||
path = "/"
|
||||
port = 9100
|
||||
address_mode = "driver"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,131 @@
|
|||
job "telemetry-smartctl-exporter" {
|
||||
datacenters = ["neptune", "dathomir", "corrin", "bespin"]
|
||||
type = "system"
|
||||
priority = "100"
|
||||
|
||||
group "smartctl_exporter" {
|
||||
network {
|
||||
port "smartctl_exporter" { static = 9101 }
|
||||
}
|
||||
|
||||
# This init task creates "fake" disk devices. This way, we can
|
||||
# restrict which devices we expose to smartctl_exporter while having
|
||||
# the same task configuration on all hosts.
|
||||
task "create_fake_disks" {
|
||||
driver = "docker"
|
||||
user = "root"
|
||||
|
||||
config {
|
||||
image = "bash:5.2.37"
|
||||
args = [
|
||||
"-x", "${NOMAD_TASK_DIR}/create_fake_disks.sh"
|
||||
]
|
||||
readonly_rootfs = true
|
||||
|
||||
mount {
|
||||
type = "bind"
|
||||
target = "/dev"
|
||||
source = "/dev"
|
||||
readonly = false
|
||||
}
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<EOF
|
||||
echo "Checking existing disks and creating fake devices if needed..."
|
||||
[ -a "/dev/sda" ] || ln -s /dev/null /dev/sda
|
||||
[ -a "/dev/sdb" ] || ln -s /dev/null /dev/sdb
|
||||
[ -a "/dev/nvme0" ] || ln -s /dev/null /dev/nvme0
|
||||
EOF
|
||||
destination = "local/create_fake_disks.sh"
|
||||
perms = 755
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 10
|
||||
memory = 10
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
hook = "prestart"
|
||||
sidecar = false
|
||||
}
|
||||
}
|
||||
|
||||
task "smartctl_exporter" {
|
||||
driver = "docker"
|
||||
# Necessary to use low-level SMART and NVMe commands
|
||||
user = "root"
|
||||
|
||||
config {
|
||||
image = "prometheuscommunity/smartctl-exporter:v0.13.0"
|
||||
args = [
|
||||
"--web.listen-address=0.0.0.0:${NOMAD_PORT_smartctl_exporter}"
|
||||
]
|
||||
ports = ["smartctl_exporter"]
|
||||
# CAP_SYS_RAWIO is needed for SMART requests, while CAP_SYS_ADMIN
|
||||
# is needed for NVMe requests.
|
||||
# These capabilities need to be allowed in the Nomad client config.
|
||||
cap_drop = ["all"]
|
||||
cap_add = ["CAP_SYS_RAWIO", "CAP_SYS_ADMIN"]
|
||||
# Hardening options to avoid running the container as privileged,
|
||||
# while still allowing just enough syscalls so that smartctl can query the disks.
|
||||
security_opt = [
|
||||
"no-new-privileges",
|
||||
# Apparently there is no variable to determine the path to the allocation, hence this hack
|
||||
"seccomp=/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}/local/smartctl-seccomp.json",
|
||||
]
|
||||
readonly_rootfs = true
|
||||
# Sadly, devices must exist for Docker to accept this option, otherwise it fails to run.
|
||||
# This is why we create "fake" devices in the init task above.
|
||||
devices = [
|
||||
{
|
||||
host_path = "/dev/sda"
|
||||
container_path = "/dev/sda"
|
||||
cgroup_permissions = "r"
|
||||
},
|
||||
{
|
||||
host_path = "/dev/sdb"
|
||||
container_path = "/dev/sdb"
|
||||
cgroup_permissions = "r"
|
||||
},
|
||||
{
|
||||
host_path = "/dev/nvme0"
|
||||
container_path = "/dev/nvme0"
|
||||
cgroup_permissions = "r"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
template {
|
||||
data = file("../config/smartctl-seccomp.json")
|
||||
destination = "local/smartctl-seccomp.json"
|
||||
perms = 444
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 40
|
||||
}
|
||||
|
||||
service {
|
||||
tags = [ "telemetry" ]
|
||||
port = "smartctl_exporter"
|
||||
name = "smartctl-exporter"
|
||||
check {
|
||||
type = "http"
|
||||
path = "/"
|
||||
port = 9101
|
||||
address_mode = "driver"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,49 +0,0 @@
|
|||
job "telemetry-system" {
|
||||
datacenters = ["neptune", "dathomir", "corrin", "bespin"]
|
||||
type = "system"
|
||||
priority = "100"
|
||||
|
||||
group "collector" {
|
||||
network {
|
||||
port "node_exporter" { static = 9100 }
|
||||
}
|
||||
|
||||
task "node_exporter" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "quay.io/prometheus/node-exporter:v1.8.1"
|
||||
network_mode = "host"
|
||||
volumes = [
|
||||
"/:/host:ro,rslave"
|
||||
]
|
||||
args = [ "--path.rootfs=/host" ]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 50
|
||||
memory = 40
|
||||
}
|
||||
|
||||
service {
|
||||
tags = [ "telemetry" ]
|
||||
port = 9100
|
||||
address_mode = "driver"
|
||||
name = "node-exporter"
|
||||
check {
|
||||
type = "http"
|
||||
path = "/"
|
||||
port = 9100
|
||||
address_mode = "driver"
|
||||
interval = "60s"
|
||||
timeout = "5s"
|
||||
check_restart {
|
||||
limit = 3
|
||||
grace = "90s"
|
||||
ignore_warnings = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -36,10 +36,15 @@
|
|||
deuxfleurs.wgautomeshPort = 1667;
|
||||
deuxfleurs.services.wgautomesh.logLevel = "debug";
|
||||
|
||||
programs.vim = {
|
||||
enable = true;
|
||||
defaultEditor = true;
|
||||
};
|
||||
|
||||
# Bootstrap IPs for Consul cluster,
|
||||
# these are IPs on the Wireguard overlay
|
||||
services.consul.extraConfig.retry_join = [
|
||||
"10.14.1.3" # caribou
|
||||
"10.14.4.1" # df-pw5
|
||||
"10.14.2.33" # origan
|
||||
"10.14.3.1" # piranha
|
||||
];
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
|
||||
deuxfleurs.hostName = "caribou";
|
||||
deuxfleurs.staticIPv6.address = "2a01:e34:ec05:8a40::23";
|
||||
deuxfleurs.isRaftServer = true;
|
||||
|
||||
# this denotes the version at install time; do not update
|
||||
system.stateVersion = "21.05";
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
deuxfleurs.hostName = "piranha";
|
||||
deuxfleurs.staticIPv4.address = "192.168.5.25";
|
||||
deuxfleurs.staticIPv6.address = "2001:912:1ac0:2200::25";
|
||||
deuxfleurs.isRaftServer = true;
|
||||
|
||||
# this denotes the version at install time; do not update
|
||||
system.stateVersion = "24.05";
|
||||
|
|
|
@ -26,6 +26,11 @@ if [ "$CLUSTER" = "prod" ]; then
|
|||
message "2. Reboot node manually. You can also take the opportunity to upgrade with:"
|
||||
message " REBOOT_NODES=yes ./upgrade_nixos prod $NIXHOST"
|
||||
message "3. Mark node as eligible again in Nomad"
|
||||
message ""
|
||||
message "If you are certain that the update is not disruptive, you can manually apply changes:"
|
||||
message "1. Connect to node '$NIXHOST' over SSH"
|
||||
message "2. Run this on the node:"
|
||||
message " sudo nixos-rebuild switch"
|
||||
message "-------------------------------------------------------------------------------------"
|
||||
else
|
||||
cmd nixos-rebuild switch
|
||||
|
|
|
@ -47,5 +47,6 @@ ports so that we can avoid conflicts when adding services.
|
|||
8999 opendkim
|
||||
9090 prometheus
|
||||
9100 node_exporter
|
||||
9101 smartctl_exporter
|
||||
9334 tricot metrics
|
||||
9991 guichet
|
||||
|
|
|
@ -68,11 +68,6 @@ SystemMaxUse=1G
|
|||
# Enable support for all terminal emulators such as urxvt
|
||||
environment.enableAllTerminfo = true;
|
||||
|
||||
programs.vim = {
|
||||
enable = true;
|
||||
defaultEditor = true;
|
||||
};
|
||||
|
||||
# Enable network time
|
||||
services.ntp.enable = false;
|
||||
services.timesyncd.enable = true;
|
||||
|
|
|
@ -397,6 +397,7 @@ in
|
|||
{
|
||||
volumes.enabled = true;
|
||||
allow_privileged = true;
|
||||
allow_caps = ["all"];
|
||||
}
|
||||
];
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue