staging: telemetry: Use an init task to create fake disk devices for smartctl_exporter

2025-03-24 17:47:05 +01:00 · 2025-03-24 17:47:05 +01:00 · ec1fa3e540
commit ec1fa3e540
parent 67230dd60c
1 changed files with 57 additions and 3 deletions
--- a/cluster/staging/app/telemetry/deploy/telemetry-smartctl-exporter.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry-smartctl-exporter.hcl
@ -8,6 +8,50 @@ job "telemetry-smartctl-exporter" {
      port "smartctl_exporter" { static = 9101 }
    }

+    # Init task: makes sure /dev/sda, /dev/sdb and /dev/nvme0 exist on every
+    # host (missing ones become symlinks to /dev/null), so smartctl_exporter
+    # can declare one fixed, restricted device list with the same config.
+    task "create_fake_disks" {
+      driver = "docker"
+      user = "root"  # the script creates entries in the bind-mounted /dev
+
+      config {
+        image = "bash:5.2.37"
+        args = [
+          "-x", "${NOMAD_TASK_DIR}/create_fake_disks.sh"
+        ]
+        readonly_rootfs = true
+
+        mount {
+          type = "bind"
+          target = "/dev"
+          source = "/dev"
+          readonly = false  # must be writable: the symlinks are created in /dev
+        }
+      }
+
+      template {
+        data = <<EOF
+          echo "Checking existing disks and creating fake devices if needed..."
+          [ -e "/dev/sda" ] || ln -s /dev/null /dev/sda
+          [ -e "/dev/sdb" ] || ln -s /dev/null /dev/sdb
+          [ -e "/dev/nvme0" ] || ln -s /dev/null /dev/nvme0
+        EOF
+        destination = "local/create_fake_disks.sh"
+        perms = "755"
+      }
+
+      resources {
+        cpu = 10
+        memory = 10
+      }
+
+      lifecycle {
+        hook    = "prestart"
+        sidecar = false  # one-shot task, not kept running next to the main task
+      }
+    }
+
    task "smartctl_exporter" {
      driver = "docker"
      # Necessary to use low-level SMART and NVMe commands
@ -21,6 +65,7 @@ job "telemetry-smartctl-exporter" {
        network_mode = "host"
        # CAP_SYS_RAWIO is needed for SMART requests, while CAP_SYS_ADMIN
        # is needed for NVMe requests.
+        # These capabilities need to be allowed in the Nomad client config.
        cap_drop = ["all"]
        cap_add = ["CAP_SYS_RAWIO", "CAP_SYS_ADMIN"]
        # Hardening options to avoid running the container as privileged,
@ -31,14 +76,23 @@ job "telemetry-smartctl-exporter" {
          "seccomp=/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}/local/smartctl-seccomp.json",
        ]
        readonly_rootfs = true
-        # Sadly, devices must exist for Docker to accept this option, so
-        # we can't declare all possible devices.
-        # This may help: https://docs.docker.com/reference/cli/docker/container/run/#device-cgroup-rule
+        # Sadly, devices must exist for Docker to accept this option, otherwise it fails to run.
+        # This is why we create "fake" devices in the init task above.
        devices = [
          {
            host_path = "/dev/sda"
            container_path = "/dev/sda"
            cgroup_permissions = "r"
+          },
+          {
+            host_path = "/dev/sdb"
+            container_path = "/dev/sdb"
+            cgroup_permissions = "r"
+          },
+          {
+            host_path = "/dev/nvme0"
+            container_path = "/dev/nvme0"
+            cgroup_permissions = "r"
          }
        ]
      }