From a7af0c8af98b8009ef9c7047d9f710c3bcf4e41a Mon Sep 17 00:00:00 2001
From: Alex Auvolat <alex@adnab.me>
Date: Wed, 16 Nov 2022 13:27:24 +0100
Subject: [PATCH] Add best practices and doc of monitoring (fix #419)

---
 doc/book/cookbook/monitoring.md               |  305 +++++
 doc/book/cookbook/real-world.md               |   46 +-
 doc/book/cookbook/recovering.md               |    2 +-
 doc/book/cookbook/upgrading.md                |    2 +-
 doc/book/quick-start/_index.md                |    3 +-
 .../grafana-garage-dashboard-prometheus.json  | 1053 +++++++++++++++++
 6 files changed, 1400 insertions(+), 11 deletions(-)
 create mode 100644 doc/book/cookbook/monitoring.md
 create mode 100644 script/telemetry/grafana-garage-dashboard-prometheus.json

diff --git a/doc/book/cookbook/monitoring.md b/doc/book/cookbook/monitoring.md
new file mode 100644
index 00000000..0a3a33e7
--- /dev/null
+++ b/doc/book/cookbook/monitoring.md
@@ -0,0 +1,305 @@
++++
+title = "Monitoring Garage"
+weight = 40
++++
+
+Garage exposes some internal metrics in the Prometheus data format.
+This page explains how to exploit these metrics.
+
+## Setting up monitoring
+
+### Enabling the Admin API endpoint
+
+If you have not already enabled the [administration API endpoint](@/documentation/reference-manual/admin-api.md), do so by adding the following lines to your configuration file:
+
+```toml
+[admin]
+api_bind_addr = "0.0.0.0:3903"
+```
+
+This will allow anyone to scrape Prometheus metrics by fetching
+`http://localhost:3093/metrics`. If you want to restrict access
+to the exported metrics, set the `metrics_token` configuration value
+to a bearer token to be used when fetching the metrics endpoint.
+
+### Setting up Prometheus and Grafana
+
+Add a scrape config to your Prometheus daemon to scrape metrics from
+all of your nodes:
+
+```yaml
+scrape_configs:
+  - job_name: 'garage'
+    static_configs:
+      - targets:
+        - 'node1.mycluster:3903'
+        - 'node2.mycluster:3903'
+        - 'node3.mycluster:3903'
+```
+
+If you have set a metrics token in your Garage configuration file,
+add the following lines in your Prometheus scrape config:
+
+```yaml
+    authorization:
+      type: Bearer
+      credentials: 'your metrics token'
+```
+
+To visualize the scraped data in Grafana,
+you can either import our [Grafana dashboard for Garage](https://git.deuxfleurs.fr/Deuxfleurs/garage/raw/branch/main/script/telemetry/grafana-garage-dashboard-prometheus.json)
+or make your own.
+We detail below the list of exposed metrics and their meaning.
+
+
+
+## List of exported metrics
+
+
+### API endpoints
+
+#### `api_admin_request_counter` (counter)
+
+Counts the number of requests to a given endpoint of the administration API. Example:
+
+```
+api_admin_request_counter{api_endpoint="Metrics"} 127041
+```
+
+#### `api_admin_request_duration` (histogram)
+
+Evaluates the duration of API calls to the various administration API endpoint. Example:
+
+```
+api_admin_request_duration_bucket{api_endpoint="Metrics",le="0.5"} 127041
+api_admin_request_duration_sum{api_endpoint="Metrics"} 605.250344830999
+api_admin_request_duration_count{api_endpoint="Metrics"} 127041
+```
+
+#### `api_s3_request_counter` (counter)
+
+Counts the number of requests to a given endpoint of the S3 API. Example:
+
+```
+api_s3_request_counter{api_endpoint="CreateMultipartUpload"} 1
+```
+
+#### `api_s3_error_counter` (counter)
+
+Counts the number of requests to a given endpoint of the S3 API that returned an error. Example:
+
+```
+api_s3_error_counter{api_endpoint="GetObject",status_code="404"} 39
+```
+
+#### `api_s3_request_duration` (histogram)
+
+Evaluates the duration of API calls to the various S3 API endpoints. Example:
+
+```
+api_s3_request_duration_bucket{api_endpoint="CreateMultipartUpload",le="0.5"} 1
+api_s3_request_duration_sum{api_endpoint="CreateMultipartUpload"} 0.046340762
+api_s3_request_duration_count{api_endpoint="CreateMultipartUpload"} 1
+```
+
+#### `api_k2v_request_counter` (counter), `api_k2v_error_counter` (counter), `api_k2v_error_duration` (histogram)
+
+Same as for S3, for the K2V API.
+
+
+### Web endpoint
+
+
+#### `web_request_counter` (counter)
+
+Number of requests to the web endpoint
+
+```
+web_request_counter{method="GET"} 80
+```
+
+#### `web_request_duration` (histogram)
+
+Duration of requests to the web endpoint
+
+```
+web_request_duration_bucket{method="GET",le="0.5"} 80
+web_request_duration_sum{method="GET"} 1.0528433229999998
+web_request_duration_count{method="GET"} 80
+```
+
+#### `web_error_counter` (counter)
+
+Number of requests to the web endpoint resulting in errors
+
+```
+web_error_counter{method="GET",status_code="404 Not Found"} 64
+```
+
+
+### Data block manager
+
+#### `block_bytes_read`, `block_bytes_written` (counter)
+
+Number of bytes read/written to/from disk in the data storage directory.
+
+```
+block_bytes_read 120586322022
+block_bytes_written 3386618077
+```
+
+#### `block_read_duration`, `block_write_duration` (histograms)
+
+Evaluates the duration of the reading/writing of individual data blocks in the data storage directory.
+
+```
+block_read_duration_bucket{le="0.5"} 169229
+block_read_duration_sum 2761.6902550310056
+block_read_duration_count 169240
+block_write_duration_bucket{le="0.5"} 3559
+block_write_duration_sum 195.59170078500006
+block_write_duration_count 3571
+```
+
+#### `block_delete_counter` (counter)
+
+Counts the number of data blocks that have been deleted from storage.
+
+```
+block_delete_counter 122
+```
+
+#### `block_resync_counter` (counter), `block_resync_duration` (histogram)
+
+Counts the number of resync operations the node has executed, and evaluates their duration.
+
+```
+block_resync_counter 308897
+block_resync_duration_bucket{le="0.5"} 308892
+block_resync_duration_sum 139.64204196100016
+block_resync_duration_count 308897
+```
+
+#### `block_resync_queue_length` (gauge)
+
+The number of block hashes currently queued for a resync.
+This is normal to be nonzero for long periods of time.
+
+```
+block_resync_queue_length 0
+```
+
+#### `block_resync_errored_blocks` (gauge)
+
+The number of block hashes that we were unable to resync last time we tried.
+**THIS SHOULD BE ZERO, OR FALL BACK TO ZERO RAPIDLY, IN A HEALTHY CLUSTER.**
+
+```
+block_resync_errored_blocks 0
+```
+
+
+### RPC (remote procedure calls) between nodes
+
+#### `rpc_netapp_request_counter` (counter)
+
+Number of RPC requests emitted
+
+```
+rpc_request_counter{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 176
+```
+
+#### `rpc_netapp_error_counter` (counter)
+
+Number of communication errors (errors in the Netapp library)
+
+```
+rpc_netapp_error_counter{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 354
+```
+
+#### `rpc_timeout_counter` (counter)
+
+Number of RPC timeouts
+
+```
+rpc_timeout_counter{from="<this node>",rpc_endpoint="garage_rpc/membership.rs/SystemRpc",to="<remote node>"} 1
+```
+
+#### `rpc_duration` (histogram)
+
+The duration of internal RPC calls between Garage nodes.
+
+```
+rpc_duration_bucket{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>",le="0.5"} 166
+rpc_duration_sum{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 35.172253716
+rpc_duration_count{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 174
+```
+
+
+### Metadata table manager
+
+#### `table_gc_todo_queue_length` (gauge)
+
+Table garbage collector TODO queue length
+
+```
+table_gc_todo_queue_length{table_name="block_ref"} 0
+```
+
+#### `table_get_request_counter` (counter), `table_get_request_duration` (histogram)
+
+Number of get/get_range requests internally made on each table, and their duration.
+
+```
+table_get_request_counter{table_name="bucket_alias"} 315
+table_get_request_duration_bucket{table_name="bucket_alias",le="0.5"} 315
+table_get_request_duration_sum{table_name="bucket_alias"} 0.048509778000000024
+table_get_request_duration_count{table_name="bucket_alias"} 315
+```
+
+
+#### `table_put_request_counter` (counter), `table_put_request_duration` (histogram)
+
+Number of insert/insert_many requests internally made on this table, and their duration
+
+```
+table_put_request_counter{table_name="block_ref"} 677
+table_put_request_duration_bucket{table_name="block_ref",le="0.5"} 677
+table_put_request_duration_sum{table_name="block_ref"} 61.617528636
+table_put_request_duration_count{table_name="block_ref"} 677
+```
+
+#### `table_internal_delete_counter` (counter)
+
+Number of value deletions in the tree (due to GC or repartitioning)
+
+```
+table_internal_delete_counter{table_name="block_ref"} 2296
+```
+
+#### `table_internal_update_counter` (counter)
+
+Number of value updates where the value actually changes (includes creation of new key and update of existing key)
+
+```
+table_internal_update_counter{table_name="block_ref"} 5996
+```
+
+#### `table_merkle_updater_todo_queue_length` (gauge)
+
+Merkle tree updater TODO queue length (should fall to zero rapidly)
+
+```
+table_merkle_updater_todo_queue_length{table_name="block_ref"} 0
+```
+
+#### `table_sync_items_received`, `table_sync_items_sent` (counters)
+
+Number of data items sent to/recieved from other nodes during resync procedures
+
+```
+table_sync_items_received{from="<remote node>",table_name="bucket_v2"} 3
+table_sync_items_sent{table_name="block_ref",to="<remote node>"} 2
+```
+
+
diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md
index 4fcb5cf7..6fe7e547 100644
--- a/doc/book/cookbook/real-world.md
+++ b/doc/book/cookbook/real-world.md
@@ -11,8 +11,9 @@ We recommend first following the [quick start guide](@/documentation/quick-start
 to get familiar with Garage's command line and usage patterns.
 
 
+## Preparing your environment
 
-## Prerequisites
+### Prerequisites
 
 To run a real-world deployment, make sure the following conditions are met:
 
@@ -21,10 +22,6 @@ To run a real-world deployment, make sure the following conditions are met:
 - Each machine has a public IP address which is reachable by other machines.
   Running behind a NAT is likely to be possible but hasn't been tested for the latest version (TODO).
 
-- Ideally, each machine should have a SSD available in addition to the HDD you are dedicating
-  to Garage. This will allow for faster access to metadata and has the potential
-  to significantly reduce Garage's response times.
-
 - This guide will assume you are using Docker containers to deploy Garage on each node. 
   Garage can also be run independently, for instance as a [Systemd service](@/documentation/cookbook/systemd.md).
   You can also use an orchestrator such as Nomad or Kubernetes to automatically manage
@@ -49,6 +46,42 @@ available in the different locations of your cluster is roughly the same.
 For instance, here, the Mercury node could be moved to Brussels; this would allow the cluster
 to store 2 TB of data in total.
 
+### Best practices
+
+- If you have fast dedicated networking between all your nodes, and are planing to store
+  very large files, bump the `block_size` configuration parameter to 10 MB
+  (`block_size = 10485760`).
+
+- Garage stores its files in two locations: it uses a metadata directory to store frequently-accessed
+  small metadata items, and a data directory to store data blocks of uploaded objects.
+  Ideally, the metadata directory would be stored on an SSD (smaller but faster),
+  and the data directory would be stored on an HDD (larger but slower).
+
+- For the data directory, Garage already does checksumming and integrity verification,
+  so there is no need to use a filesystem such as BTRFS or ZFS that does it.
+  We recommend using XFS for the data partition, as it has the best performance.
+  Ext4 is not recommended as it has more strict limitations on the number of inodes,
+  which might cause issues with Garage when large numbers of objects are stored.
+
+- If you only have an HDD and no SSD, it's fine to put your metadata alongside the data
+  on the same drive. Having lots of RAM for your kernel to cache the metadata will
+  help a lot with performance.  Make sure to use the LMDB database engine,
+  instead of Sled, which suffers from quite bad performance degradation on HDDs.
+  Sled is still the default for legacy reasons, but is not recommended anymore.
+
+- For the metadata storage, Garage does not do checksumming and integrity
+  verification on its own. If you are afraid of bitrot/data corruption,
+  put your metadata directory on a BTRFS partition. Otherwise, just use regular
+  EXT4 or XFS.
+
+- Having a single server with several storage drives is currently not very well
+  supported in Garage ([#218](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/218)).
+  For an easy setup, just put all your drives in a RAID0 or a ZFS RAIDZ array.
+  If you're adventurous, you can try to format each of your disk as
+  a separate XFS partition, and then run one `garage` daemon per disk drive,
+  or use something like [`mergerfs`](https://github.com/trapexit/mergerfs) to merge
+  all your disks in a single union filesystem that spreads load over them.
+
 ## Get a Docker image
 
 Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated).
@@ -81,6 +114,7 @@ A valid `/etc/garage/garage.toml` for our cluster would look as follows:
 ```toml
 metadata_dir = "/var/lib/garage/meta"
 data_dir = "/var/lib/garage/data"
+db_engine = "lmdb"
 
 replication_mode = "3"
 
@@ -90,8 +124,6 @@ rpc_bind_addr = "[::]:3901"
 rpc_public_addr = "<this node's public IP>:3901"
 rpc_secret = "<RPC secret>"
 
-bootstrap_peers = []
-
 [s3_api]
 s3_region = "garage"
 api_bind_addr = "[::]:3900"
diff --git a/doc/book/cookbook/recovering.md b/doc/book/cookbook/recovering.md
index 2424558c..2129a7f3 100644
--- a/doc/book/cookbook/recovering.md
+++ b/doc/book/cookbook/recovering.md
@@ -1,6 +1,6 @@
 +++
 title = "Recovering from failures"
-weight = 35
+weight = 50
 +++
 
 Garage is meant to work on old, second-hand hardware.
diff --git a/doc/book/cookbook/upgrading.md b/doc/book/cookbook/upgrading.md
index f7659921..9f2ba73b 100644
--- a/doc/book/cookbook/upgrading.md
+++ b/doc/book/cookbook/upgrading.md
@@ -1,6 +1,6 @@
 +++
 title = "Upgrading Garage"
-weight = 40
+weight = 60
 +++
 
 Garage is a stateful clustered application, where all nodes are communicating together and share data structures.
diff --git a/doc/book/quick-start/_index.md b/doc/book/quick-start/_index.md
index 03c542f9..ac55d2f7 100644
--- a/doc/book/quick-start/_index.md
+++ b/doc/book/quick-start/_index.md
@@ -54,6 +54,7 @@ to generate unique and private secrets for security reasons:
 cat > garage.toml <<EOF
 metadata_dir = "/tmp/meta"
 data_dir = "/tmp/data"
+db_engine = "lmdb"
 
 replication_mode = "none"
 
@@ -61,8 +62,6 @@ rpc_bind_addr = "[::]:3901"
 rpc_public_addr = "127.0.0.1:3901"
 rpc_secret = "$(openssl rand -hex 32)"
 
-bootstrap_peers = []
-
 [s3_api]
 s3_region = "garage"
 api_bind_addr = "[::]:3900"
diff --git a/script/telemetry/grafana-garage-dashboard-prometheus.json b/script/telemetry/grafana-garage-dashboard-prometheus.json
new file mode 100644
index 00000000..28ef1ec0
--- /dev/null
+++ b/script/telemetry/grafana-garage-dashboard-prometheus.json
@@ -0,0 +1,1053 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_DS_PROMETHEUS",
+      "label": "DS_PROMETHEUS",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__elements": {},
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "9.2.0"
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "timeseries",
+      "name": "Time series",
+      "version": ""
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "datasource",
+          "uid": "grafana"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "target": {
+          "limit": 100,
+          "matchAny": false,
+          "tags": [],
+          "type": "dashboard"
+        },
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 24,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "Bps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 0,
+        "y": 0
+      },
+      "id": 10,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "sum(rate(block_bytes_read{job=\"garage\"}[$__rate_interval])    )",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Disk bytes read",
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "-sum(rate(block_bytes_written{job=\"garage\"}[$__rate_interval])    )",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Disk bytes written",
+          "refId": "B"
+        }
+      ],
+      "title": "Disk I/O",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "reqps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 8,
+        "y": 0
+      },
+      "id": 3,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum by (api_endpoint) (rate(api_s3_request_counter {job=\"garage\"}[$__rate_interval]))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{api_endpoint}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "API requests",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "reqps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 16,
+        "y": 0
+      },
+      "id": 9,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "sum(rate(web_request_counter {job=\"garage\"}[$__rate_interval]))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Web request rate",
+          "refId": "A"
+        }
+      ],
+      "title": "Web requests",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "reqps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 9
+      },
+      "id": 2,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "sum by (rpc_endpoint) (rate(rpc_request_counter {job=\"garage\"}[$__rate_interval]))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{rpc_endpoint}}",
+          "refId": "A"
+        }
+      ],
+      "title": "RPC requests",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "reqps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 8,
+        "y": 9
+      },
+      "id": 4,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum by (api_endpoint, status_code) (rate(api_s3_error_counter {job=\"garage\"}[$__rate_interval]))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{api_endpoint}} {{status_code}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "API errors",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "reqps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 16,
+        "y": 9
+      },
+      "id": 11,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "sum by(status_code) (rate(web_error_counter {job=\"garage\"}[$__rate_interval]))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{status_code}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Web errors",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": [
+          {
+            "__systemRef": "hideSeriesFrom",
+            "matcher": {
+              "id": "byNames",
+              "options": {
+                "mode": "exclude",
+                "names": [
+                  "10.83.2.3:3903"
+                ],
+                "prefix": "All except:",
+                "readOnly": true
+              }
+            },
+            "properties": [
+              {
+                "id": "custom.hideFrom",
+                "value": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": true
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 17
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "block_resync_queue_length{job=\"garage\"}",
+          "interval": "",
+          "legendFormat": "{{instance}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Resync queue length",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 8,
+        "y": 17
+      },
+      "id": 7,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "sum by(table_name) (table_gc_todo_queue_length{job=\"garage\"})",
+          "interval": "",
+          "legendFormat": "{{ table_name}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Table GC queue length",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 16,
+        "y": 17
+      },
+      "id": 8,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "sum by(table_name) (table_merkle_updater_todo_queue_length{job=\"garage\"})",
+          "interval": "",
+          "legendFormat": "{{ table_name}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Table Merkle updater queue length",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 25
+      },
+      "id": 12,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_DS_PROMETHEUS}"
+          },
+          "exemplar": true,
+          "expr": "block_resync_errored_blocks{job=\"garage\"}",
+          "interval": "",
+          "legendFormat": "{{instance}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Resync errored blocks",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 37,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "Garage",
+  "uid": "ys3pnpZ4k",
+  "version": 26,
+  "weekStart": ""
+}
\ No newline at end of file