Garage v0.9 #473

Merged
lx merged 175 commits from next into main 2023-10-10 13:28:29 +00:00
4 changed files with 51 additions and 23 deletions
Showing only changes of commit 35c108b85d - Show all commits

View file

@ -52,7 +52,7 @@ Returns an HTTP status 200 if the node is ready to answer user's requests,
and an HTTP status 503 (Service Unavailable) if there are some partitions
for which a quorum of nodes is not available.
A simple textual message is also returned in a body with content-type `text/plain`.
See `/v0/health` for an API that also returns JSON output.
See `/v1/health` for an API that also returns JSON output.
### Cluster operations
@ -161,21 +161,21 @@ Example response body:
}
```
#### GetClusterHealth `GET /v0/health`
#### GetClusterHealth `GET /v1/health`
Returns the cluster's current health in JSON format, with the following variables:
- `status`: one of `Healthy`, `Degraded` or `Unavailable`:
- Healthy: Garage node is connected to all storage nodes
- Degraded: Garage node is not connected to all storage nodes, but a quorum of write nodes is available for all partitions
- Unavailable: a quorum of write nodes is not available for some partitions
- `known_nodes`: the number of nodes this Garage node has had a TCP connection to since the daemon started
- `connected_nodes`: the number of nodes this Garage node currently has an open connection to
- `storage_nodes`: the number of storage nodes currently registered in the cluster layout
- `storage_nodes_ok`: the number of storage nodes to which a connection is currently open
- `status`: one of `healthy`, `degraded` or `unavailable`:
- healthy: Garage node is connected to all storage nodes
- degraded: Garage node is not connected to all storage nodes, but a quorum of write nodes is available for all partitions
- unavailable: a quorum of write nodes is not available for some partitions
- `knownNodes`: the number of nodes this Garage node has had a TCP connection to since the daemon started
- `connectedNodes`: the number of nodes this Garage node currently has an open connection to
- `storageNodes`: the number of storage nodes currently registered in the cluster layout
- `storageNodesOk`: the number of storage nodes to which a connection is currently open
- `partitions`: the total number of partitions of the data (currently always 256)
- `partitions_quorum`: the number of partitions for which a quorum of write nodes is available
- `partitions_all_ok`: the number of partitions for which we are connected to all storage nodes responsible for storing it
- `partitionsQuorum`: the number of partitions for which a quorum of write nodes is available
- `partitionsAllOk`: the number of partitions for which we are connected to all storage nodes responsible for storing it
Contrary to `GET /health`, this endpoint always returns a 200 OK HTTP response code.
@ -183,14 +183,14 @@ Example response body:
```json
{
"status": "Degraded",
"known_nodes": 3,
"connected_nodes": 2,
"storage_nodes": 3,
"storage_nodes_ok": 2,
"partitions": 256,
"partitions_quorum": 256,
"partitions_all_ok": 0
"status": "degraded",
"knownNodes": 3,
"connectedNodes": 3,
"storageNodes": 4,
"storageNodesOk": 3,
"partitions": 256,
"partitionsQuorum": 256,
"partitionsAllOk": 64
}
```

View file

@ -40,7 +40,22 @@ pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<
}
/// Builds the JSON response describing the cluster's current health.
///
/// Reads the health snapshot from `garage.system.health()`, converts its
/// status enum into a lowercase string label, copies the numeric counters
/// into the API-facing `ClusterHealth` struct, and hands that struct to
/// `json_ok_response` for serialization.
///
/// # Errors
///
/// Propagates any error returned by `json_ok_response`.
pub async fn handle_get_cluster_health(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
    use garage_rpc::system::ClusterHealthStatus;

    let sys_health = garage.system.health();

    // The API exposes the status as a plain lowercase string rather than
    // as the internal enum.
    let status_label = match sys_health.status {
        ClusterHealthStatus::Healthy => "healthy",
        ClusterHealthStatus::Degraded => "degraded",
        ClusterHealthStatus::Unavailable => "unavailable",
    };

    let response = ClusterHealth {
        status: status_label,
        known_nodes: sys_health.known_nodes,
        connected_nodes: sys_health.connected_nodes,
        storage_nodes: sys_health.storage_nodes,
        storage_nodes_ok: sys_health.storage_nodes_ok,
        partitions: sys_health.partitions,
        partitions_quorum: sys_health.partitions_quorum,
        partitions_all_ok: sys_health.partitions_all_ok,
    };

    Ok(json_ok_response(&response)?)
}
@ -120,6 +135,19 @@ fn get_cluster_layout(garage: &Arc<Garage>) -> GetClusterLayoutResponse {
// ----
/// API-facing representation of the cluster health, serialized to JSON.
///
/// Field names are emitted in camelCase (e.g. `known_nodes` becomes
/// `knownNodes`) because of the `rename_all` attribute below.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterHealth {
/// Status label; the handler fills it with `"healthy"`, `"degraded"` or `"unavailable"`.
pub status: &'static str,
/// Number of nodes this Garage node has had a TCP connection to since the daemon started.
pub known_nodes: usize,
/// Number of nodes this Garage node currently has an open connection to.
pub connected_nodes: usize,
/// Number of storage nodes currently registered in the cluster layout.
pub storage_nodes: usize,
/// Number of storage nodes to which a connection is currently open.
pub storage_nodes_ok: usize,
/// Total number of partitions of the data.
pub partitions: usize,
/// Number of partitions for which a quorum of write nodes is available.
pub partitions_quorum: usize,
/// Number of partitions for which all responsible storage nodes are connected.
pub partitions_all_ok: usize,
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct GetClusterStatusResponse {

View file

@ -96,7 +96,7 @@ impl Endpoint {
GET "/health" => Health,
GET "/metrics" => Metrics,
GET "/v1/status" => GetClusterStatus,
GET "/v0/health" => GetClusterHealth,
GET "/v1/health" => GetClusterHealth,
POST "/v0/connect" => ConnectClusterNodes,
// Layout endpoints
GET "/v1/layout" => GetClusterLayout,

View file

@ -151,7 +151,7 @@ pub struct KnownNodeInfo {
pub status: NodeStatus,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy)]
pub struct ClusterHealth {
/// The current health status of the cluster (see below)
pub status: ClusterHealthStatus,
@ -171,7 +171,7 @@ pub struct ClusterHealth {
pub partitions_all_ok: usize,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy)]
pub enum ClusterHealthStatus {
/// All nodes are available
Healthy,