prod: telemetry: Add smartctl_exporter based on staging work
This commit is contained in:
parent
596b7ab966
commit
797f946578
3 changed files with 313 additions and 0 deletions
|
@ -16,6 +16,17 @@ scrape_configs:
|
||||||
cert_file: /etc/prometheus/consul-client.crt
|
cert_file: /etc/prometheus/consul-client.crt
|
||||||
key_file: /etc/prometheus/consul-client.key
|
key_file: /etc/prometheus/consul-client.key
|
||||||
|
|
||||||
|
- job_name: 'smartctl-exporter'
|
||||||
|
scrape_interval: 60s
|
||||||
|
consul_sd_configs:
|
||||||
|
- server: 'https://localhost:8501'
|
||||||
|
services:
|
||||||
|
- 'smartctl-exporter'
|
||||||
|
tls_config:
|
||||||
|
ca_file: /etc/prometheus/consul-ca.crt
|
||||||
|
cert_file: /etc/prometheus/consul-client.crt
|
||||||
|
key_file: /etc/prometheus/consul-client.key
|
||||||
|
|
||||||
- job_name: 'tricot'
|
- job_name: 'tricot'
|
||||||
consul_sd_configs:
|
consul_sd_configs:
|
||||||
- server: 'https://localhost:8501'
|
- server: 'https://localhost:8501'
|
||||||
|
|
170
cluster/prod/app/telemetry/config/smartctl-seccomp.json
Normal file
170
cluster/prod/app/telemetry/config/smartctl-seccomp.json
Normal file
|
@ -0,0 +1,170 @@
|
||||||
|
{
|
||||||
|
"defaultAction": "SCMP_ACT_ERRNO",
|
||||||
|
"defaultErrnoRet": 1,
|
||||||
|
"architectures": [
|
||||||
|
"SCMP_ARCH_X86_64"
|
||||||
|
],
|
||||||
|
"syscalls": [
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"rt_sigaction",
|
||||||
|
"rt_sigprocmask",
|
||||||
|
"getpid",
|
||||||
|
"fcntl",
|
||||||
|
"fstatfs",
|
||||||
|
"gettid",
|
||||||
|
"futex",
|
||||||
|
"getdents64",
|
||||||
|
"epoll_ctl",
|
||||||
|
"tgkill",
|
||||||
|
"openat",
|
||||||
|
"read",
|
||||||
|
"close",
|
||||||
|
"nanosleep",
|
||||||
|
"getsockname",
|
||||||
|
"setsockopt",
|
||||||
|
"chdir",
|
||||||
|
"capget",
|
||||||
|
"prctl",
|
||||||
|
"accept4",
|
||||||
|
"fstat",
|
||||||
|
"getcwd",
|
||||||
|
"setuid",
|
||||||
|
"setgid",
|
||||||
|
"setgroups",
|
||||||
|
"capset",
|
||||||
|
"newfstatat",
|
||||||
|
"write",
|
||||||
|
"writev",
|
||||||
|
"mmap",
|
||||||
|
"brk",
|
||||||
|
"rt_sigreturn",
|
||||||
|
"access",
|
||||||
|
"execve",
|
||||||
|
"getppid",
|
||||||
|
"exit_group",
|
||||||
|
"faccessat2",
|
||||||
|
"mprotect",
|
||||||
|
"pread64",
|
||||||
|
"arch_prctl",
|
||||||
|
"set_tid_address",
|
||||||
|
"set_robust_list",
|
||||||
|
"rseq",
|
||||||
|
"munmap",
|
||||||
|
"madvise",
|
||||||
|
"sigaltstack",
|
||||||
|
"statfs",
|
||||||
|
"waitid",
|
||||||
|
"readlinkat",
|
||||||
|
"eventfd2",
|
||||||
|
"epoll_create1",
|
||||||
|
"pipe2",
|
||||||
|
"pidfd_send_signal",
|
||||||
|
"pidfd_open",
|
||||||
|
"readlink",
|
||||||
|
"epoll_pwait",
|
||||||
|
"dup3",
|
||||||
|
"bind",
|
||||||
|
"listen",
|
||||||
|
"getrlimit",
|
||||||
|
"sched_getaffinity",
|
||||||
|
"sched_yield"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ALLOW",
|
||||||
|
"comment": "globally needed by the go runtime"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"open",
|
||||||
|
"uname"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ALLOW",
|
||||||
|
"comment": "Used by smartctl"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"ioctl"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ALLOW",
|
||||||
|
"comment": "allow SG_IO (aka SCSI commands) on ioctl as it's what's used to read SMART data",
|
||||||
|
"args": [
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"value": 8837,
|
||||||
|
"op": "SCMP_CMP_EQ"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"ioctl"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ALLOW",
|
||||||
|
"comment": "allow NVME_IOCTL_ID command (0x4e40) on ioctl as it's what's used to read data on NVMe devices",
|
||||||
|
"args": [
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"value": 20032,
|
||||||
|
"op": "SCMP_CMP_EQ"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"ioctl"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ALLOW",
|
||||||
|
"comment": "allow NVME_IOCTL_ADMIN_CMD command (0xc0484e41) on ioctl as it's what's used to read data on NVMe devices. For some reason, it needs to be encoded as 0xffffffffc0484e41",
|
||||||
|
"args": [
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"value": 18446744072640548417,
|
||||||
|
"op": "SCMP_CMP_EQ"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"ioctl"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ERRNO",
|
||||||
|
"comment": "Debug to allow/deny all ioctl (change to _LOG, _ALLOW, or _ERRNO appropriately)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"clone"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ALLOW",
|
||||||
|
"comment": "partially allow clone as per docker config",
|
||||||
|
"args": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"value": 2114060288,
|
||||||
|
"op": "SCMP_CMP_MASKED_EQ"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"clone3"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ERRNO",
|
||||||
|
"comment": "disable clone3 in a specific way as per docker's default config",
|
||||||
|
"errnoRet": 38
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"names": [
|
||||||
|
"socket"
|
||||||
|
],
|
||||||
|
"action": "SCMP_ACT_ALLOW",
|
||||||
|
"comment": "allow IPv4 sockets",
|
||||||
|
"args": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"value": 2,
|
||||||
|
"op": "SCMP_CMP_EQ"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -0,0 +1,132 @@
|
||||||
|
job "telemetry-smartctl-exporter" {
|
||||||
|
datacenters = ["neptune", "scorpio", "bespin", "corrin", "dathomir"]
|
||||||
|
type = "system"
|
||||||
|
priority = "100"
|
||||||
|
|
||||||
|
group "smartctl_exporter" {
|
||||||
|
network {
|
||||||
|
port "smartctl_exporter" { static = 9101 }
|
||||||
|
}
|
||||||
|
|
||||||
|
# This init task creates "fake" disk devices. This way, we can
|
||||||
|
# restrict which devices we expose to smartctl_exporter while having
|
||||||
|
# the same task configuration on all hosts.
|
||||||
|
task "create_fake_disks" {
|
||||||
|
driver = "docker"
|
||||||
|
user = "root"
|
||||||
|
|
||||||
|
config {
|
||||||
|
image = "bash:5.2.37"
|
||||||
|
args = [
|
||||||
|
"-x", "${NOMAD_TASK_DIR}/create_fake_disks.sh"
|
||||||
|
]
|
||||||
|
readonly_rootfs = true
|
||||||
|
|
||||||
|
mount {
|
||||||
|
type = "bind"
|
||||||
|
target = "/dev"
|
||||||
|
source = "/dev"
|
||||||
|
readonly = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template {
|
||||||
|
data = <<EOF
|
||||||
|
echo "Checking existing disks and creating fake devices if needed..."
|
||||||
|
[ -a "/dev/sda" ] || ln -s /dev/null /dev/sda
|
||||||
|
[ -a "/dev/sdb" ] || ln -s /dev/null /dev/sdb
|
||||||
|
[ -a "/dev/nvme0" ] || ln -s /dev/null /dev/nvme0
|
||||||
|
EOF
|
||||||
|
destination = "local/create_fake_disks.sh"
|
||||||
|
perms = 755
|
||||||
|
}
|
||||||
|
|
||||||
|
resources {
|
||||||
|
cpu = 10
|
||||||
|
memory = 10
|
||||||
|
}
|
||||||
|
|
||||||
|
lifecycle {
|
||||||
|
hook = "prestart"
|
||||||
|
sidecar = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
task "smartctl_exporter" {
|
||||||
|
driver = "docker"
|
||||||
|
# Necessary to use low-level SMART and NVMe commands
|
||||||
|
user = "root"
|
||||||
|
|
||||||
|
config {
|
||||||
|
image = "prometheuscommunity/smartctl-exporter:v0.13.0"
|
||||||
|
args = [
|
||||||
|
"--web.listen-address=0.0.0.0:9101"
|
||||||
|
]
|
||||||
|
network_mode = "host"
|
||||||
|
# CAP_SYS_RAWIO is needed for SMART requests, while CAP_SYS_ADMIN
|
||||||
|
# is needed for NVMe requests.
|
||||||
|
# These capabilities need to be allowed in the Nomad client config.
|
||||||
|
cap_drop = ["all"]
|
||||||
|
cap_add = ["CAP_SYS_RAWIO", "CAP_SYS_ADMIN"]
|
||||||
|
# Hardening options to avoid running the container as privileged,
|
||||||
|
# while still allowing just enough syscalls so that smartctl can query the disks.
|
||||||
|
security_opt = [
|
||||||
|
"no-new-privileges",
|
||||||
|
# Apparently there is no variable to determine the path to the allocation, hence this hack
|
||||||
|
"seccomp=/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}/local/smartctl-seccomp.json",
|
||||||
|
]
|
||||||
|
readonly_rootfs = true
|
||||||
|
# Sadly, devices must exist for Docker to accept this option, otherwise it fails to run.
|
||||||
|
# This is why we create "fake" devices in the init task above.
|
||||||
|
devices = [
|
||||||
|
{
|
||||||
|
host_path = "/dev/sda"
|
||||||
|
container_path = "/dev/sda"
|
||||||
|
cgroup_permissions = "r"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
host_path = "/dev/sdb"
|
||||||
|
container_path = "/dev/sdb"
|
||||||
|
cgroup_permissions = "r"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
host_path = "/dev/nvme0"
|
||||||
|
container_path = "/dev/nvme0"
|
||||||
|
cgroup_permissions = "r"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
template {
|
||||||
|
data = file("../config/smartctl-seccomp.json")
|
||||||
|
destination = "local/smartctl-seccomp.json"
|
||||||
|
perms = 444
|
||||||
|
}
|
||||||
|
|
||||||
|
resources {
|
||||||
|
cpu = 50
|
||||||
|
memory = 40
|
||||||
|
}
|
||||||
|
|
||||||
|
service {
|
||||||
|
tags = [ "telemetry" ]
|
||||||
|
port = 9101
|
||||||
|
address_mode = "driver"
|
||||||
|
name = "smartctl-exporter"
|
||||||
|
check {
|
||||||
|
type = "http"
|
||||||
|
path = "/"
|
||||||
|
port = 9101
|
||||||
|
address_mode = "driver"
|
||||||
|
interval = "60s"
|
||||||
|
timeout = "5s"
|
||||||
|
check_restart {
|
||||||
|
limit = 3
|
||||||
|
grace = "90s"
|
||||||
|
ignore_warnings = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Add table
Reference in a new issue