Implement /health admin API endpoint to check node health #440

Merged
lx merged 5 commits from admin-health-api into main 2022-12-11 17:25:29 +00:00
3 changed files with 33 additions and 48 deletions
Showing only changes of commit d7868c48a4 - Show all commits

View file

@ -1,4 +1,3 @@
use std::fmt::Write;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::sync::Arc; use std::sync::Arc;
@ -78,7 +77,7 @@ impl AdminApiServer {
.body(Body::empty())?) .body(Body::empty())?)
} }
fn handle_health(&self, format: Option<&str>) -> Result<Response<Body>, Error> { fn handle_health(&self) -> Result<Response<Body>, Error> {
let health = self.garage.system.health(); let health = self.garage.system.health();
let (status, status_str) = match health.status { let (status, status_str) = match health.status {
@ -92,47 +91,15 @@ impl AdminApiServer {
"Quorum is not available for some/all partitions, reads and writes will fail", "Quorum is not available for some/all partitions, reads and writes will fail",
), ),
}; };
let status_str = format!(
"{}\nConsult the full health check API endpoint at /v0/health for more details\n",
status_str
);
let resp = Response::builder().status(status); Ok(Response::builder()
.status(status)
if matches!(format, Some("json")) { .header(http::header::CONTENT_TYPE, "text/plain")
let resp_json = .body(Body::from(status_str))?)
serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?;
Ok(resp
.header(http::header::CONTENT_TYPE, "application/json")
.body(Body::from(resp_json))?)
} else {
let mut buf = status_str.to_string();
writeln!(
&mut buf,
"\nAll nodes: {} connected, {} known",
health.connected_nodes, health.known_nodes,
)
.unwrap();
writeln!(
&mut buf,
"Storage nodes: {} connected, {} in layout",
health.storage_nodes_ok, health.storage_nodes
)
.unwrap();
writeln!(&mut buf, "Number of partitions: {}", health.partitions).unwrap();
writeln!(
&mut buf,
"Partitions with quorum: {}",
health.partitions_quorum
)
.unwrap();
writeln!(
&mut buf,
"Partitions with all nodes available: {}",
health.partitions_all_ok
)
.unwrap();
Ok(resp
.header(http::header::CONTENT_TYPE, "text/plain")
.body(Body::from(buf))?)
}
} }
fn handle_metrics(&self) -> Result<Response<Body>, Error> { fn handle_metrics(&self) -> Result<Response<Body>, Error> {
@ -207,9 +174,10 @@ impl ApiHandler for AdminApiServer {
match endpoint { match endpoint {
Endpoint::Options => self.handle_options(&req), Endpoint::Options => self.handle_options(&req),
Endpoint::Health { format } => self.handle_health(format.as_deref()), Endpoint::Health => self.handle_health(),
Endpoint::Metrics => self.handle_metrics(), Endpoint::Metrics => self.handle_metrics(),
Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await, Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await,
Endpoint::GetClusterHealth => handle_get_cluster_health(&self.garage).await,
Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await, Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await,
// Layout // Layout
Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await, Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await,

View file

@ -9,6 +9,7 @@ use garage_util::crdt::*;
use garage_util::data::*; use garage_util::data::*;
use garage_rpc::layout::*; use garage_rpc::layout::*;
use garage_rpc::system::ClusterHealthStatus;
use garage_model::garage::Garage; use garage_model::garage::Garage;
@ -43,6 +44,22 @@ pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<
Ok(json_ok_response(&res)?) Ok(json_ok_response(&res)?)
} }
pub async fn handle_get_cluster_health(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
let health = garage.system.health();
let status = match health.status {
ClusterHealthStatus::Unavailable => StatusCode::SERVICE_UNAVAILABLE,
_ => StatusCode::OK,
};
let resp_json =
serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?;
Ok(Response::builder()
.status(status)
.header(http::header::CONTENT_TYPE, "application/json")
.body(Body::from(resp_json))?)
}
pub async fn handle_connect_cluster_nodes( pub async fn handle_connect_cluster_nodes(
garage: &Arc<Garage>, garage: &Arc<Garage>,
req: Request<Body>, req: Request<Body>,

View file

@ -17,11 +17,10 @@ router_match! {@func
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub enum Endpoint { pub enum Endpoint {
Options, Options,
Health { Health,
format: Option<String>,
},
Metrics, Metrics,
GetClusterStatus, GetClusterStatus,
GetClusterHealth,
ConnectClusterNodes, ConnectClusterNodes,
// Layout // Layout
GetClusterLayout, GetClusterLayout,
@ -92,9 +91,10 @@ impl Endpoint {
let res = router_match!(@gen_path_parser (req.method(), path, query) [ let res = router_match!(@gen_path_parser (req.method(), path, query) [
OPTIONS _ => Options, OPTIONS _ => Options,
GET "/health" => Health (query_opt::format), GET "/health" => Health,
GET "/metrics" => Metrics, GET "/metrics" => Metrics,
GET "/v0/status" => GetClusterStatus, GET "/v0/status" => GetClusterStatus,
GET "/v0/health" => GetClusterHealth,
POST "/v0/connect" => ConnectClusterNodes, POST "/v0/connect" => ConnectClusterNodes,
// Layout endpoints // Layout endpoints
GET "/v0/layout" => GetClusterLayout, GET "/v0/layout" => GetClusterLayout,
@ -135,7 +135,7 @@ impl Endpoint {
/// Get the kind of authorization which is required to perform the operation. /// Get the kind of authorization which is required to perform the operation.
pub fn authorization_type(&self) -> Authorization { pub fn authorization_type(&self) -> Authorization {
match self { match self {
Self::Health { .. } => Authorization::None, Self::Health => Authorization::None,
Self::Metrics => Authorization::MetricsToken, Self::Metrics => Authorization::MetricsToken,
_ => Authorization::AdminToken, _ => Authorization::AdminToken,
} }