Garage v0.9 #473

Merged
lx merged 175 commits from next into main 2023-10-10 13:28:29 +00:00
4 changed files with 51 additions and 23 deletions
Showing only changes of commit 35c108b85d - Show all commits

View file

@ -52,7 +52,7 @@ Returns an HTTP status 200 if the node is ready to answer user's requests,
and an HTTP status 503 (Service Unavailable) if there are some partitions and an HTTP status 503 (Service Unavailable) if there are some partitions
for which a quorum of nodes is not available. for which a quorum of nodes is not available.
A simple textual message is also returned in a body with content-type `text/plain`. A simple textual message is also returned in a body with content-type `text/plain`.
See `/v0/health` for an API that also returns JSON output. See `/v1/health` for an API that also returns JSON output.
### Cluster operations ### Cluster operations
@ -161,21 +161,21 @@ Example response body:
} }
``` ```
#### GetClusterHealth `GET /v0/health` #### GetClusterHealth `GET /v1/health`
Returns the cluster's current health in JSON format, with the following variables: Returns the cluster's current health in JSON format, with the following variables:
- `status`: one of `Healthy`, `Degraded` or `Unavailable`: - `status`: one of `healthy`, `degraded` or `unavailable`:
- Healthy: Garage node is connected to all storage nodes - healthy: Garage node is connected to all storage nodes
- Degraded: Garage node is not connected to all storage nodes, but a quorum of write nodes is available for all partitions - degraded: Garage node is not connected to all storage nodes, but a quorum of write nodes is available for all partitions
- Unavailable: a quorum of write nodes is not available for some partitions - unavailable: a quorum of write nodes is not available for some partitions
- `known_nodes`: the number of nodes this Garage node has had a TCP connection to since the daemon started - `knownNodes`: the number of nodes this Garage node has had a TCP connection to since the daemon started
- `connected_nodes`: the number of nodes this Garage node currently has an open connection to - `connectedNodes`: the number of nodes this Garage node currently has an open connection to
- `storage_nodes`: the number of storage nodes currently registered in the cluster layout - `storageNodes`: the number of storage nodes currently registered in the cluster layout
- `storage_nodes_ok`: the number of storage nodes to which a connection is currently open - `storageNodesOk`: the number of storage nodes to which a connection is currently open
- `partitions`: the total number of partitions of the data (currently always 256) - `partitions`: the total number of partitions of the data (currently always 256)
- `partitions_quorum`: the number of partitions for which a quorum of write nodes is available - `partitionsQuorum`: the number of partitions for which a quorum of write nodes is available
- `partitions_all_ok`: the number of partitions for which we are connected to all storage nodes responsible for storing it - `partitionsAllOk`: the number of partitions for which we are connected to all storage nodes responsible for storing it
Contrary to `GET /health`, this endpoint always returns a 200 OK HTTP response code. Contrary to `GET /health`, this endpoint always returns a 200 OK HTTP response code.
@ -183,14 +183,14 @@ Example response body:
```json ```json
{ {
"status": "Degraded", "status": "degraded",
"known_nodes": 3, "knownNodes": 3,
"connected_nodes": 2, "connectedNodes": 3,
"storage_nodes": 3, "storageNodes": 4,
"storage_nodes_ok": 2, "storageNodesOk": 3,
"partitions": 256, "partitions": 256,
"partitions_quorum": 256, "partitionsQuorum": 256,
"partitions_all_ok": 0 "partitionsAllOk": 64
} }
``` ```

View file

@ -40,7 +40,22 @@ pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<
} }
pub async fn handle_get_cluster_health(garage: &Arc<Garage>) -> Result<Response<Body>, Error> { pub async fn handle_get_cluster_health(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
use garage_rpc::system::ClusterHealthStatus;
let health = garage.system.health(); let health = garage.system.health();
let health = ClusterHealth {
status: match health.status {
ClusterHealthStatus::Healthy => "healthy",
ClusterHealthStatus::Degraded => "degraded",
ClusterHealthStatus::Unavailable => "unavailable",
},
known_nodes: health.known_nodes,
connected_nodes: health.connected_nodes,
storage_nodes: health.storage_nodes,
storage_nodes_ok: health.storage_nodes_ok,
partitions: health.partitions,
partitions_quorum: health.partitions_quorum,
partitions_all_ok: health.partitions_all_ok,
};
Ok(json_ok_response(&health)?) Ok(json_ok_response(&health)?)
} }
@ -120,6 +135,19 @@ fn get_cluster_layout(garage: &Arc<Garage>) -> GetClusterLayoutResponse {
// ---- // ----
/// Cluster health summary as serialized to JSON by `handle_get_cluster_health`.
///
/// Field names are emitted in camelCase (e.g. `known_nodes` becomes `knownNodes`),
/// and `status` is a plain lowercase string rather than an enum variant name.
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterHealth {
/// One of `"healthy"`, `"degraded"` or `"unavailable"`
pub status: &'static str,
/// Number of nodes this Garage node has had a TCP connection to since the daemon started
pub known_nodes: usize,
/// Number of nodes this Garage node currently has an open connection to
pub connected_nodes: usize,
/// Number of storage nodes currently registered in the cluster layout
pub storage_nodes: usize,
/// Number of storage nodes to which a connection is currently open
pub storage_nodes_ok: usize,
/// Total number of partitions of the data
pub partitions: usize,
/// Number of partitions for which a quorum of write nodes is available
pub partitions_quorum: usize,
/// Number of partitions for which we are connected to all storage nodes responsible for storing it
pub partitions_all_ok: usize,
}
#[derive(Serialize)] #[derive(Serialize)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
struct GetClusterStatusResponse { struct GetClusterStatusResponse {

View file

@ -96,7 +96,7 @@ impl Endpoint {
GET "/health" => Health, GET "/health" => Health,
GET "/metrics" => Metrics, GET "/metrics" => Metrics,
GET "/v1/status" => GetClusterStatus, GET "/v1/status" => GetClusterStatus,
GET "/v0/health" => GetClusterHealth, GET "/v1/health" => GetClusterHealth,
POST "/v0/connect" => ConnectClusterNodes, POST "/v0/connect" => ConnectClusterNodes,
// Layout endpoints // Layout endpoints
GET "/v1/layout" => GetClusterLayout, GET "/v1/layout" => GetClusterLayout,

View file

@ -151,7 +151,7 @@ pub struct KnownNodeInfo {
pub status: NodeStatus, pub status: NodeStatus,
} }
#[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[derive(Debug, Clone, Copy)]
pub struct ClusterHealth { pub struct ClusterHealth {
/// The current health status of the cluster (see below) /// The current health status of the cluster (see below)
pub status: ClusterHealthStatus, pub status: ClusterHealthStatus,
@ -171,7 +171,7 @@ pub struct ClusterHealth {
pub partitions_all_ok: usize, pub partitions_all_ok: usize,
} }
#[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[derive(Debug, Clone, Copy)]
pub enum ClusterHealthStatus { pub enum ClusterHealthStatus {
/// All nodes are available /// All nodes are available
Healthy, Healthy,