From c04dd8788a3764da2f307b1d10c2d56b7b0e4a61 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 28 Nov 2023 14:25:04 +0100 Subject: [PATCH] admin: more info in admin GetClusterStatus --- doc/drafts/admin-api.md | 143 +++++++++++++++++++++------------------ src/api/admin/cluster.rs | 122 +++++++++++++++++++++++++++------ src/garage/admin/mod.rs | 2 +- src/garage/cli/cmd.rs | 9 +-- src/rpc/system.rs | 12 ++-- 5 files changed, 192 insertions(+), 96 deletions(-) diff --git a/doc/drafts/admin-api.md b/doc/drafts/admin-api.md index 411f6418..274bd5c4 100644 --- a/doc/drafts/admin-api.md +++ b/doc/drafts/admin-api.md @@ -69,8 +69,8 @@ Example response body: ```json { - "node": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", - "garageVersion": "git:v0.9.0-dev", + "node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", + "garageVersion": "v0.10.0", "garageFeatures": [ "k2v", "sled", @@ -81,83 +81,92 @@ Example response body: ], "rustVersion": "1.68.0", "dbEngine": "LMDB (using Heed crate)", - "knownNodes": [ + "layoutVersion": 5, + "nodes": [ { - "id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", - "addr": "10.0.0.11:3901", + "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c", + "role": { + "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c", + "zone": "dc1", + "capacity": 100000000000, + "tags": [] + }, + "addr": "10.0.0.3:3901", + "hostname": "node3", "isUp": true, - "lastSeenSecsAgo": 9, - "hostname": "node1" + "lastSeenSecsAgo": 12, + "draining": false, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 + }, + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 + } }, { - "id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff", - "addr": "10.0.0.12:3901", + "id": "a11c7cf18af297379eff8688360155fe68d9061654449ba0ce239252f5a7487f", + "role": null, + "addr": "10.0.0.2:3901", + "hostname": "node2", "isUp": true, - "lastSeenSecsAgo": 1, - "hostname": "node2" + "lastSeenSecsAgo": 11, + "draining": true, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 + }, + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 + } }, { - "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", - "addr": "10.0.0.21:3901", + "id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2", + "role": { + "id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2", + "zone": "dc1", + "capacity": 100000000000, + "tags": [] + }, + "addr": "127.0.0.1:3904", + "hostname": "lindy", "isUp": true, - "lastSeenSecsAgo": 7, - "hostname": "node3" + "lastSeenSecsAgo": 2, + "draining": false, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 + }, + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 + } }, { - "id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b", - "addr": "10.0.0.22:3901", + "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", + "role": { + "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", + "zone": "dc1", + "capacity": 100000000000, + "tags": [] + }, + "addr": "10.0.0.1:3901", + "hostname": "node1", "isUp": true, - "lastSeenSecsAgo": 1, - "hostname": "node4" + "lastSeenSecsAgo": 3, + "draining": false, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 + }, + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 + } } - ], - "layout": { - "version": 12, - "roles": [ - { - "id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", - "zone": "dc1", - "capacity": 10737418240, - "tags": [ - "node1" - ] - }, - { - "id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff", - "zone": "dc1", - "capacity": 10737418240, - "tags": [ - "node2" - ] - }, - { - "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", - "zone": "dc2", - "capacity": 10737418240, - "tags": [ - "node3" - ] - } - ], - "stagedRoleChanges": [ - { - "id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b", - "remove": false, - "zone": "dc2", - "capacity": 10737418240, - "tags": [ - "node4" - ] - } - { - "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", - "remove": true, - "zone": null, - "capacity": null, - "tags": null, - } - ] - } + ] } ``` diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 593bd778..3ce1b254 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::net::SocketAddr; use std::sync::Arc; @@ -15,25 +16,95 @@ use crate::admin::error::*; use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_get_cluster_status(garage: &Arc) -> Result, Error> { + let layout = garage.system.cluster_layout(); + let mut nodes = garage + .system + .get_known_nodes() + .into_iter() + .map(|i| { + ( + i.id, + NodeResp { + id: hex::encode(i.id), + addr: Some(i.addr), + hostname: i.status.hostname, + is_up: i.is_up, + last_seen_secs_ago: i.last_seen_secs_ago, + data_partition: i + .status + .data_disk_avail + .map(|(avail, total)| FreeSpaceResp { + available: avail, + total, + }), + metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| { + FreeSpaceResp { + available: avail, + total, + } + }), + ..Default::default() + }, + ) + }) + .collect::>(); + + for (id, _, role) in layout.current().roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + let role = NodeRoleResp { + id: hex::encode(id), + zone: r.zone.to_string(), + capacity: r.capacity, + tags: r.tags.clone(), + }; + match nodes.get_mut(id) { + None => { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + role: Some(role), + ..Default::default() + }, + ); + } + Some(n) => { + if n.role.is_none() { + n.role = Some(role); + } + } + } + } + } + + for ver in layout.versions.iter().rev().skip(1) { + for (id, _, role) in ver.roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + if !nodes.contains_key(id) && r.capacity.is_some() { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + draining: true, + ..Default::default() + }, + ); + } + } + } + } + + let mut nodes = nodes.into_iter().map(|(_, v)| v).collect::>(); + nodes.sort_by(|x, y| x.id.cmp(&y.id)); + let res = GetClusterStatusResponse { node: hex::encode(garage.system.id), garage_version: garage_util::version::garage_version(), garage_features: garage_util::version::garage_features(), rust_version: garage_util::version::rust_version(), db_engine: garage.db.engine(), - known_nodes: garage - .system - .get_known_nodes() - .into_iter() - .map(|i| KnownNodeResp { - id: hex::encode(i.id), - addr: i.addr, - is_up: i.is_up, - last_seen_secs_ago: i.last_seen_secs_ago, - hostname: i.status.hostname, - }) - .collect(), - layout: format_cluster_layout(&garage.system.cluster_layout()), + layout_version: layout.current().version, + nodes, }; Ok(json_ok_response(&res)?) @@ -157,8 +228,8 @@ struct GetClusterStatusResponse { garage_features: Option<&'static [&'static str]>, rust_version: &'static str, db_engine: String, - known_nodes: Vec, - layout: GetClusterLayoutResponse, + layout_version: u64, + nodes: Vec, } #[derive(Serialize)] @@ -192,14 +263,27 @@ struct NodeRoleResp { tags: Vec, } -#[derive(Serialize)] +#[derive(Serialize, Default)] #[serde(rename_all = "camelCase")] -struct KnownNodeResp { +struct FreeSpaceResp { + available: u64, + total: u64, +} + +#[derive(Serialize, Default)] +#[serde(rename_all = "camelCase")] +struct NodeResp { id: String, - addr: SocketAddr, + role: Option, + addr: Option, + hostname: Option, is_up: bool, last_seen_secs_ago: Option, - hostname: String, + draining: bool, + #[serde(skip_serializing_if = "Option::is_none")] + data_partition: Option, + #[serde(skip_serializing_if = "Option::is_none")] + metadata_partition: Option, } // ---- update functions ---- diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index 77918a0f..da4226cf 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -295,7 +295,7 @@ impl AdminRpcHandler { let info = node_info.get(id); let status = info.map(|x| &x.status); let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref()); - let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); + let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let capacity = role .map(|x| x.capacity_string()) diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 4d1306b6..c7f0ad2b 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -62,6 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { + let host = adv.status.hostname.as_deref().unwrap_or("?"); if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -75,7 +76,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -95,7 +96,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -108,7 +109,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, - h = adv.status.hostname, + h = host, addr = adv.addr, new_role = new_role, )); @@ -149,7 +150,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> // it is in a failed state, add proper line to the output let (host, addr, last_seen) = match adv { Some(adv) => ( - adv.status.hostname.as_str(), + adv.status.hostname.as_deref().unwrap_or("?"), adv.addr.to_string(), adv.last_seen_secs_ago .map(|s| tf.convert(Duration::from_secs(s))) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index c7d41ee4..be4aefa2 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -126,7 +126,7 @@ pub struct System { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NodeStatus { /// Hostname of the node - pub hostname: String, + pub hostname: Option, /// Replication factor configured on the node pub replication_factor: usize, @@ -765,9 +765,11 @@ impl EndpointHandler for System { impl NodeStatus { fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { NodeStatus { - hostname: gethostname::gethostname() - .into_string() - .unwrap_or_else(|_| "".to_string()), + hostname: Some( + gethostname::gethostname() + .into_string() + .unwrap_or_else(|_| "".to_string()), + ), replication_factor, layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, @@ -777,7 +779,7 @@ impl NodeStatus { fn unknown() -> Self { NodeStatus { - hostname: "?".to_string(), + hostname: None, replication_factor: 0, layout_digest: Default::default(), meta_disk_avail: None,