admin: more info in admin GetClusterStatus

This commit is contained in:
Alex 2023-11-28 14:25:04 +01:00
parent 539af6eac4
commit c04dd8788a
Signed by untrusted user: lx
GPG key ID: 0E496D15096376BE
5 changed files with 192 additions and 96 deletions

View file

@ -69,8 +69,8 @@ Example response body:
```json
{
"node": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
"garageVersion": "git:v0.9.0-dev",
"node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"garageVersion": "v0.10.0",
"garageFeatures": [
"k2v",
"sled",
@ -81,84 +81,93 @@ Example response body:
],
"rustVersion": "1.68.0",
"dbEngine": "LMDB (using Heed crate)",
"knownNodes": [
"layoutVersion": 5,
"nodes": [
{
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
"addr": "10.0.0.11:3901",
"isUp": true,
"lastSeenSecsAgo": 9,
"hostname": "node1"
},
{
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
"addr": "10.0.0.12:3901",
"isUp": true,
"lastSeenSecsAgo": 1,
"hostname": "node2"
},
{
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
"addr": "10.0.0.21:3901",
"isUp": true,
"lastSeenSecsAgo": 7,
"hostname": "node3"
},
{
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
"addr": "10.0.0.22:3901",
"isUp": true,
"lastSeenSecsAgo": 1,
"hostname": "node4"
}
],
"layout": {
"version": 12,
"roles": [
{
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
"id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
"role": {
"id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
"zone": "dc1",
"capacity": 10737418240,
"tags": [
"node1"
]
"capacity": 100000000000,
"tags": []
},
"addr": "10.0.0.3:3901",
"hostname": "node3",
"isUp": true,
"lastSeenSecsAgo": 12,
"draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
},
{
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
"id": "a11c7cf18af297379eff8688360155fe68d9061654449ba0ce239252f5a7487f",
"role": null,
"addr": "10.0.0.2:3901",
"hostname": "node2",
"isUp": true,
"lastSeenSecsAgo": 11,
"draining": true,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
},
{
"id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
"role": {
"id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
"zone": "dc1",
"capacity": 10737418240,
"tags": [
"node2"
]
"capacity": 100000000000,
"tags": []
},
"addr": "127.0.0.1:3904",
"hostname": "lindy",
"isUp": true,
"lastSeenSecsAgo": 2,
"draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
},
{
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
"zone": "dc2",
"capacity": 10737418240,
"tags": [
"node3"
]
"id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"role": {
"id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"zone": "dc1",
"capacity": 100000000000,
"tags": []
},
"addr": "10.0.0.1:3901",
"hostname": "node1",
"isUp": true,
"lastSeenSecsAgo": 3,
"draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
],
"stagedRoleChanges": [
{
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
"remove": false,
"zone": "dc2",
"capacity": 10737418240,
"tags": [
"node4"
]
}
{
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
"remove": true,
"zone": null,
"capacity": null,
"tags": null,
}
]
}
}
```
#### GetClusterHealth `GET /v1/health`

View file

@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::net::SocketAddr;
use std::sync::Arc;
@ -15,25 +16,95 @@ use crate::admin::error::*;
use crate::helpers::{json_ok_response, parse_json_body};
pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
let layout = garage.system.cluster_layout();
let mut nodes = garage
.system
.get_known_nodes()
.into_iter()
.map(|i| {
(
i.id,
NodeResp {
id: hex::encode(i.id),
addr: Some(i.addr),
hostname: i.status.hostname,
is_up: i.is_up,
last_seen_secs_ago: i.last_seen_secs_ago,
data_partition: i
.status
.data_disk_avail
.map(|(avail, total)| FreeSpaceResp {
available: avail,
total,
}),
metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| {
FreeSpaceResp {
available: avail,
total,
}
}),
..Default::default()
},
)
})
.collect::<HashMap<_, _>>();
for (id, _, role) in layout.current().roles.items().iter() {
if let layout::NodeRoleV(Some(r)) = role {
let role = NodeRoleResp {
id: hex::encode(id),
zone: r.zone.to_string(),
capacity: r.capacity,
tags: r.tags.clone(),
};
match nodes.get_mut(id) {
None => {
nodes.insert(
*id,
NodeResp {
id: hex::encode(id),
role: Some(role),
..Default::default()
},
);
}
Some(n) => {
if n.role.is_none() {
n.role = Some(role);
}
}
}
}
}
for ver in layout.versions.iter().rev().skip(1) {
for (id, _, role) in ver.roles.items().iter() {
if let layout::NodeRoleV(Some(r)) = role {
if !nodes.contains_key(id) && r.capacity.is_some() {
nodes.insert(
*id,
NodeResp {
id: hex::encode(id),
draining: true,
..Default::default()
},
);
}
}
}
}
let mut nodes = nodes.into_iter().map(|(_, v)| v).collect::<Vec<_>>();
nodes.sort_by(|x, y| x.id.cmp(&y.id));
let res = GetClusterStatusResponse {
node: hex::encode(garage.system.id),
garage_version: garage_util::version::garage_version(),
garage_features: garage_util::version::garage_features(),
rust_version: garage_util::version::rust_version(),
db_engine: garage.db.engine(),
known_nodes: garage
.system
.get_known_nodes()
.into_iter()
.map(|i| KnownNodeResp {
id: hex::encode(i.id),
addr: i.addr,
is_up: i.is_up,
last_seen_secs_ago: i.last_seen_secs_ago,
hostname: i.status.hostname,
})
.collect(),
layout: format_cluster_layout(&garage.system.cluster_layout()),
layout_version: layout.current().version,
nodes,
};
Ok(json_ok_response(&res)?)
@ -157,8 +228,8 @@ struct GetClusterStatusResponse {
garage_features: Option<&'static [&'static str]>,
rust_version: &'static str,
db_engine: String,
known_nodes: Vec<KnownNodeResp>,
layout: GetClusterLayoutResponse,
layout_version: u64,
nodes: Vec<NodeResp>,
}
#[derive(Serialize)]
@ -192,14 +263,27 @@ struct NodeRoleResp {
tags: Vec<String>,
}
#[derive(Serialize)]
#[derive(Serialize, Default)]
#[serde(rename_all = "camelCase")]
struct KnownNodeResp {
struct FreeSpaceResp {
available: u64,
total: u64,
}
#[derive(Serialize, Default)]
#[serde(rename_all = "camelCase")]
struct NodeResp {
id: String,
addr: SocketAddr,
role: Option<NodeRoleResp>,
addr: Option<SocketAddr>,
hostname: Option<String>,
is_up: bool,
last_seen_secs_ago: Option<u64>,
hostname: String,
draining: bool,
#[serde(skip_serializing_if = "Option::is_none")]
data_partition: Option<FreeSpaceResp>,
#[serde(skip_serializing_if = "Option::is_none")]
metadata_partition: Option<FreeSpaceResp>,
}
// ---- update functions ----

View file

@ -295,7 +295,7 @@ impl AdminRpcHandler {
let info = node_info.get(id);
let status = info.map(|x| &x.status);
let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref());
let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?");
let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?");
let zone = role.map(|x| x.zone.as_str()).unwrap_or("?");
let capacity = role
.map(|x| x.capacity_string())

View file

@ -62,6 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
let mut healthy_nodes =
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()];
for adv in status.iter().filter(|adv| adv.is_up) {
let host = adv.status.hostname.as_deref().unwrap_or("?");
if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) {
let data_avail = match &adv.status.data_disk_avail {
_ if cfg.capacity.is_none() => "N/A".into(),
@ -75,7 +76,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}",
id = adv.id,
host = adv.status.hostname,
host = host,
addr = adv.addr,
tags = cfg.tags.join(","),
zone = cfg.zone,
@ -95,7 +96,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...",
id = adv.id,
host = adv.status.hostname,
host = host,
addr = adv.addr,
tags = cfg.tags.join(","),
zone = cfg.zone,
@ -108,7 +109,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
healthy_nodes.push(format!(
"{id:?}\t{h}\t{addr}\t\t\t{new_role}",
id = adv.id,
h = adv.status.hostname,
h = host,
addr = adv.addr,
new_role = new_role,
));
@ -149,7 +150,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
// it is in a failed state, add proper line to the output
let (host, addr, last_seen) = match adv {
Some(adv) => (
adv.status.hostname.as_str(),
adv.status.hostname.as_deref().unwrap_or("?"),
adv.addr.to_string(),
adv.last_seen_secs_ago
.map(|s| tf.convert(Duration::from_secs(s)))

View file

@ -126,7 +126,7 @@ pub struct System {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeStatus {
/// Hostname of the node
pub hostname: String,
pub hostname: Option<String>,
/// Replication factor configured on the node
pub replication_factor: usize,
@ -765,9 +765,11 @@ impl EndpointHandler<SystemRpc> for System {
impl NodeStatus {
fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self {
NodeStatus {
hostname: gethostname::gethostname()
hostname: Some(
gethostname::gethostname()
.into_string()
.unwrap_or_else(|_| "<invalid utf-8>".to_string()),
),
replication_factor,
layout_digest: layout_manager.layout().digest(),
meta_disk_avail: None,
@ -777,7 +779,7 @@ impl NodeStatus {
fn unknown() -> Self {
NodeStatus {
hostname: "?".to_string(),
hostname: None,
replication_factor: 0,
layout_digest: Default::default(),
meta_disk_avail: None,