admin: more info in admin GetClusterStatus

This commit is contained in:
Alex 2023-11-28 14:25:04 +01:00
parent 539af6eac4
commit c04dd8788a
Signed by untrusted user: lx
GPG key ID: 0E496D15096376BE
5 changed files with 192 additions and 96 deletions

View file

@ -69,8 +69,8 @@ Example response body:
```json ```json
{ {
"node": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", "node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"garageVersion": "git:v0.9.0-dev", "garageVersion": "v0.10.0",
"garageFeatures": [ "garageFeatures": [
"k2v", "k2v",
"sled", "sled",
@ -81,84 +81,93 @@ Example response body:
], ],
"rustVersion": "1.68.0", "rustVersion": "1.68.0",
"dbEngine": "LMDB (using Heed crate)", "dbEngine": "LMDB (using Heed crate)",
"knownNodes": [ "layoutVersion": 5,
"nodes": [
{ {
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
"addr": "10.0.0.11:3901", "role": {
"isUp": true, "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
"lastSeenSecsAgo": 9,
"hostname": "node1"
},
{
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
"addr": "10.0.0.12:3901",
"isUp": true,
"lastSeenSecsAgo": 1,
"hostname": "node2"
},
{
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
"addr": "10.0.0.21:3901",
"isUp": true,
"lastSeenSecsAgo": 7,
"hostname": "node3"
},
{
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
"addr": "10.0.0.22:3901",
"isUp": true,
"lastSeenSecsAgo": 1,
"hostname": "node4"
}
],
"layout": {
"version": 12,
"roles": [
{
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
"zone": "dc1", "zone": "dc1",
"capacity": 10737418240, "capacity": 100000000000,
"tags": [ "tags": []
"node1" },
] "addr": "10.0.0.3:3901",
"hostname": "node3",
"isUp": true,
"lastSeenSecsAgo": 12,
"draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
}, },
{ {
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff", "id": "a11c7cf18af297379eff8688360155fe68d9061654449ba0ce239252f5a7487f",
"role": null,
"addr": "10.0.0.2:3901",
"hostname": "node2",
"isUp": true,
"lastSeenSecsAgo": 11,
"draining": true,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
},
{
"id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
"role": {
"id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
"zone": "dc1", "zone": "dc1",
"capacity": 10737418240, "capacity": 100000000000,
"tags": [ "tags": []
"node2" },
] "addr": "127.0.0.1:3904",
"hostname": "lindy",
"isUp": true,
"lastSeenSecsAgo": 2,
"draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
}, },
{ {
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"zone": "dc2", "role": {
"capacity": 10737418240, "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"tags": [ "zone": "dc1",
"node3" "capacity": 100000000000,
] "tags": []
},
"addr": "10.0.0.1:3901",
"hostname": "node1",
"isUp": true,
"lastSeenSecsAgo": 3,
"draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
} }
],
"stagedRoleChanges": [
{
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
"remove": false,
"zone": "dc2",
"capacity": 10737418240,
"tags": [
"node4"
]
}
{
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
"remove": true,
"zone": null,
"capacity": null,
"tags": null,
} }
] ]
} }
}
``` ```
#### GetClusterHealth `GET /v1/health` #### GetClusterHealth `GET /v1/health`

View file

@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::sync::Arc; use std::sync::Arc;
@ -15,25 +16,95 @@ use crate::admin::error::*;
use crate::helpers::{json_ok_response, parse_json_body}; use crate::helpers::{json_ok_response, parse_json_body};
pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<Body>, Error> { pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
let layout = garage.system.cluster_layout();
let mut nodes = garage
.system
.get_known_nodes()
.into_iter()
.map(|i| {
(
i.id,
NodeResp {
id: hex::encode(i.id),
addr: Some(i.addr),
hostname: i.status.hostname,
is_up: i.is_up,
last_seen_secs_ago: i.last_seen_secs_ago,
data_partition: i
.status
.data_disk_avail
.map(|(avail, total)| FreeSpaceResp {
available: avail,
total,
}),
metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| {
FreeSpaceResp {
available: avail,
total,
}
}),
..Default::default()
},
)
})
.collect::<HashMap<_, _>>();
for (id, _, role) in layout.current().roles.items().iter() {
if let layout::NodeRoleV(Some(r)) = role {
let role = NodeRoleResp {
id: hex::encode(id),
zone: r.zone.to_string(),
capacity: r.capacity,
tags: r.tags.clone(),
};
match nodes.get_mut(id) {
None => {
nodes.insert(
*id,
NodeResp {
id: hex::encode(id),
role: Some(role),
..Default::default()
},
);
}
Some(n) => {
if n.role.is_none() {
n.role = Some(role);
}
}
}
}
}
for ver in layout.versions.iter().rev().skip(1) {
for (id, _, role) in ver.roles.items().iter() {
if let layout::NodeRoleV(Some(r)) = role {
if !nodes.contains_key(id) && r.capacity.is_some() {
nodes.insert(
*id,
NodeResp {
id: hex::encode(id),
draining: true,
..Default::default()
},
);
}
}
}
}
let mut nodes = nodes.into_iter().map(|(_, v)| v).collect::<Vec<_>>();
nodes.sort_by(|x, y| x.id.cmp(&y.id));
let res = GetClusterStatusResponse { let res = GetClusterStatusResponse {
node: hex::encode(garage.system.id), node: hex::encode(garage.system.id),
garage_version: garage_util::version::garage_version(), garage_version: garage_util::version::garage_version(),
garage_features: garage_util::version::garage_features(), garage_features: garage_util::version::garage_features(),
rust_version: garage_util::version::rust_version(), rust_version: garage_util::version::rust_version(),
db_engine: garage.db.engine(), db_engine: garage.db.engine(),
known_nodes: garage layout_version: layout.current().version,
.system nodes,
.get_known_nodes()
.into_iter()
.map(|i| KnownNodeResp {
id: hex::encode(i.id),
addr: i.addr,
is_up: i.is_up,
last_seen_secs_ago: i.last_seen_secs_ago,
hostname: i.status.hostname,
})
.collect(),
layout: format_cluster_layout(&garage.system.cluster_layout()),
}; };
Ok(json_ok_response(&res)?) Ok(json_ok_response(&res)?)
@ -157,8 +228,8 @@ struct GetClusterStatusResponse {
garage_features: Option<&'static [&'static str]>, garage_features: Option<&'static [&'static str]>,
rust_version: &'static str, rust_version: &'static str,
db_engine: String, db_engine: String,
known_nodes: Vec<KnownNodeResp>, layout_version: u64,
layout: GetClusterLayoutResponse, nodes: Vec<NodeResp>,
} }
#[derive(Serialize)] #[derive(Serialize)]
@ -192,14 +263,27 @@ struct NodeRoleResp {
tags: Vec<String>, tags: Vec<String>,
} }
#[derive(Serialize)] #[derive(Serialize, Default)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
struct KnownNodeResp { struct FreeSpaceResp {
available: u64,
total: u64,
}
#[derive(Serialize, Default)]
#[serde(rename_all = "camelCase")]
struct NodeResp {
id: String, id: String,
addr: SocketAddr, role: Option<NodeRoleResp>,
addr: Option<SocketAddr>,
hostname: Option<String>,
is_up: bool, is_up: bool,
last_seen_secs_ago: Option<u64>, last_seen_secs_ago: Option<u64>,
hostname: String, draining: bool,
#[serde(skip_serializing_if = "Option::is_none")]
data_partition: Option<FreeSpaceResp>,
#[serde(skip_serializing_if = "Option::is_none")]
metadata_partition: Option<FreeSpaceResp>,
} }
// ---- update functions ---- // ---- update functions ----

View file

@ -295,7 +295,7 @@ impl AdminRpcHandler {
let info = node_info.get(id); let info = node_info.get(id);
let status = info.map(|x| &x.status); let status = info.map(|x| &x.status);
let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref()); let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref());
let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?");
let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?");
let capacity = role let capacity = role
.map(|x| x.capacity_string()) .map(|x| x.capacity_string())

View file

@ -62,6 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
let mut healthy_nodes = let mut healthy_nodes =
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()];
for adv in status.iter().filter(|adv| adv.is_up) { for adv in status.iter().filter(|adv| adv.is_up) {
let host = adv.status.hostname.as_deref().unwrap_or("?");
if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) {
let data_avail = match &adv.status.data_disk_avail { let data_avail = match &adv.status.data_disk_avail {
_ if cfg.capacity.is_none() => "N/A".into(), _ if cfg.capacity.is_none() => "N/A".into(),
@ -75,7 +76,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
healthy_nodes.push(format!( healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}",
id = adv.id, id = adv.id,
host = adv.status.hostname, host = host,
addr = adv.addr, addr = adv.addr,
tags = cfg.tags.join(","), tags = cfg.tags.join(","),
zone = cfg.zone, zone = cfg.zone,
@ -95,7 +96,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
healthy_nodes.push(format!( healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...",
id = adv.id, id = adv.id,
host = adv.status.hostname, host = host,
addr = adv.addr, addr = adv.addr,
tags = cfg.tags.join(","), tags = cfg.tags.join(","),
zone = cfg.zone, zone = cfg.zone,
@ -108,7 +109,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
healthy_nodes.push(format!( healthy_nodes.push(format!(
"{id:?}\t{h}\t{addr}\t\t\t{new_role}", "{id:?}\t{h}\t{addr}\t\t\t{new_role}",
id = adv.id, id = adv.id,
h = adv.status.hostname, h = host,
addr = adv.addr, addr = adv.addr,
new_role = new_role, new_role = new_role,
)); ));
@ -149,7 +150,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
// it is in a failed state, add proper line to the output // it is in a failed state, add proper line to the output
let (host, addr, last_seen) = match adv { let (host, addr, last_seen) = match adv {
Some(adv) => ( Some(adv) => (
adv.status.hostname.as_str(), adv.status.hostname.as_deref().unwrap_or("?"),
adv.addr.to_string(), adv.addr.to_string(),
adv.last_seen_secs_ago adv.last_seen_secs_ago
.map(|s| tf.convert(Duration::from_secs(s))) .map(|s| tf.convert(Duration::from_secs(s)))

View file

@ -126,7 +126,7 @@ pub struct System {
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeStatus { pub struct NodeStatus {
/// Hostname of the node /// Hostname of the node
pub hostname: String, pub hostname: Option<String>,
/// Replication factor configured on the node /// Replication factor configured on the node
pub replication_factor: usize, pub replication_factor: usize,
@ -765,9 +765,11 @@ impl EndpointHandler<SystemRpc> for System {
impl NodeStatus { impl NodeStatus {
fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self {
NodeStatus { NodeStatus {
hostname: gethostname::gethostname() hostname: Some(
gethostname::gethostname()
.into_string() .into_string()
.unwrap_or_else(|_| "<invalid utf-8>".to_string()), .unwrap_or_else(|_| "<invalid utf-8>".to_string()),
),
replication_factor, replication_factor,
layout_digest: layout_manager.layout().digest(), layout_digest: layout_manager.layout().digest(),
meta_disk_avail: None, meta_disk_avail: None,
@ -777,7 +779,7 @@ impl NodeStatus {
fn unknown() -> Self { fn unknown() -> Self {
NodeStatus { NodeStatus {
hostname: "?".to_string(), hostname: None,
replication_factor: 0, replication_factor: 0,
layout_digest: Default::default(), layout_digest: Default::default(),
meta_disk_avail: None, meta_disk_avail: None,