cli: improvements to the layout commands when multiple layouts are live

Alex 2023-12-08 11:24:23 +01:00
parent 91b874c4ef
commit 7f2541101f
Signed by untrusted user: lx
GPG key ID: 0E496D15096376BE
4 changed files with 49 additions and 29 deletions

@@ -274,8 +274,7 @@ impl AdminRpcHandler {
     fn gather_cluster_stats(&self) -> String {
         let mut ret = String::new();
 
-        // Gather storage node and free space statistics
-        // TODO: not only layout.current() ???
+        // Gather storage node and free space statistics for current nodes
         let layout = &self.garage.system.cluster_layout();
         let mut node_partition_count = HashMap::<Uuid, u64>::new();
         for short_id in layout.current().ring_assignment_data.iter() {

@@ -179,7 +179,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
         println!("Your cluster is expecting to drain data from nodes that are currently unavailable.");
         println!("If these nodes are definitely dead, please review the layout history with");
         println!(
-            "`garage layout history` and use `garage layout assume-sync` to force progress."
+            "`garage layout history` and use `garage layout skip-dead-nodes` to force progress."
         );
     }
 }
@@ -274,6 +274,6 @@ pub async fn fetch_status(
         .await??
     {
         SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes),
-        resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
+        resp => Err(Error::unexpected_rpc_message(resp)),
     }
 }
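
The RPC match arms in `fetch_status` above (and in `fetch_layout` further down) now delegate to `Error::unexpected_rpc_message` instead of formatting the error inline. The helper's implementation is not part of this diff; the following is only a minimal sketch of the shape such a constructor could take, based on the inline formatting it replaces. The stand-in `Error` enum and the exact message text are assumptions, not Garage's actual code.

    use std::fmt::Debug;

    // Stand-in for Garage's error type; only the Message variant is modeled here.
    #[derive(Debug)]
    pub enum Error {
        Message(String),
    }

    impl Error {
        // Hypothetical helper: centralizes the formatting the old call sites did
        // inline with format!("Invalid RPC response: {:?}", resp).
        pub fn unexpected_rpc_message<M: Debug>(resp: M) -> Self {
            Error::Message(format!("Unexpected RPC message: {:?}", resp))
        }
    }

    fn main() {
        // Example call site, mirroring the match arms in the diff.
        let err = Error::unexpected_rpc_message("SystemRpc::Ok");
        println!("{:?}", err);
    }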

@@ -354,10 +354,14 @@ pub async fn cmd_layout_history(
         ));
     }
     format_table(table);
     println!();
 
+    if layout.versions.len() > 1 {
         println!("==== UPDATE TRACKERS ====");
-        println!("This is the internal data that Garage stores to know which nodes have what data.");
+        println!("Several layout versions are currently live in the cluster, and data is being migrated.");
+        println!(
+            "This is the internal data that Garage stores to know which nodes have what data."
+        );
         println!();
         let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()];
         let all_nodes = layout.get_all_nodes();
@@ -373,16 +377,21 @@ pub async fn cmd_layout_history(
         table[1..].sort();
         format_table(table);
 
-    if layout.versions.len() > 1 {
         println!();
         println!(
-            "If some nodes are not catching up to the latest layout version in the update tracker,"
+            "If some nodes are not catching up to the latest layout version in the update trackers,"
         );
         println!("it might be because they are offline or unable to complete a sync successfully.");
         println!(
-            "You may force progress using `garage layout assume-sync --version {}`",
+            "You may force progress using `garage layout skip-dead-nodes --version {}`",
             layout.current().version
         );
+    } else {
+        println!("Your cluster is currently in a stable state with a single live layout version.");
+        println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,");
+        println!(
+            "so you might want to keep old nodes online until their data directories become empty."
+        );
     }
 
     Ok(())
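
The messages above describe the ACK/SYNC update trackers that `garage layout history` now prints only while several layout versions are live. As a rough illustration of the idea, each tracker records, per node, the highest layout version reached, and the migration handled by a tracker is over once every node reaches the newest version. The types below are simplified stand-ins for this explanation, not Garage's actual update-tracker implementation.

    use std::collections::HashMap;

    // Simplified stand-in for an update tracker: highest layout version per node.
    struct UpdateTracker(HashMap<String, u64>);

    impl UpdateTracker {
        // Mirrors the set_max calls in this diff: only ever moves a node forward,
        // and reports whether anything actually changed.
        fn set_max(&mut self, node: &str, version: u64) -> bool {
            let entry = self.0.entry(node.to_string()).or_insert(0);
            if *entry < version {
                *entry = version;
                true
            } else {
                false
            }
        }

        // For this tracker, migration is done once every node reached `version`.
        fn all_at_least(&self, version: u64) -> bool {
            self.0.values().all(|v| *v >= version)
        }
    }

    fn main() {
        let mut ack = UpdateTracker(HashMap::new());
        ack.set_max("node-a", 2);
        ack.set_max("node-b", 1); // node-b is lagging behind
        println!("all nodes ACKed version 2: {}", ack.all_at_least(2)); // false
    }
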
@@ -415,6 +424,7 @@ pub async fn cmd_layout_skip_dead_nodes(
     }
 
     let all_nodes = layout.get_all_nodes();
+    let mut did_something = false;
     for node in all_nodes.iter() {
         if status.iter().any(|x| x.id == *node && x.is_up) {
             continue;
@@ -422,19 +432,28 @@ pub async fn cmd_layout_skip_dead_nodes(
         if layout.update_trackers.ack_map.set_max(*node, opt.version) {
             println!("Increased the ACK tracker for node {:?}", node);
+            did_something = true;
         }
 
         if opt.allow_missing_data {
             if layout.update_trackers.sync_map.set_max(*node, opt.version) {
                 println!("Increased the SYNC tracker for node {:?}", node);
+                did_something = true;
             }
         }
     }
 
+    if did_something {
         send_layout(rpc_cli, rpc_host, layout).await?;
         println!("Success.");
         Ok(())
+    } else if !opt.allow_missing_data {
+        Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into()))
+    } else {
+        Err(Error::Message(
+            "Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(),
+        ))
+    }
 }
 
 // --- utility ---
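
With the new `did_something` flag, `garage layout skip-dead-nodes` reports one of three outcomes: success when at least one tracker was advanced, a hint to retry with `--allow-missing-data` when nothing changed, and a final "nothing I can do" error otherwise. The small standalone function below is illustrative only (not taken from the codebase); it condenses that control flow so it can be read in isolation.

    // Illustrative condensation of the outcome selection added in this hunk.
    fn skip_dead_nodes_outcome(
        did_something: bool,
        allow_missing_data: bool,
    ) -> Result<&'static str, &'static str> {
        if did_something {
            // At least one ACK/SYNC tracker was advanced: the updated layout is sent out.
            Ok("Success.")
        } else if !allow_missing_data {
            Err("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.")
        } else {
            Err("Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.")
        }
    }

    fn main() {
        // Example: no trackers could be bumped and --allow-missing-data was not passed.
        println!("{:?}", skip_dead_nodes_outcome(false, false));
    }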
@@ -448,7 +467,7 @@ pub async fn fetch_layout(
         .await??
     {
         SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
-        resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
+        resp => Err(Error::unexpected_rpc_message(resp)),
     }
 }

@@ -450,6 +450,8 @@ pub fn print_block_info(
     if refcount != nondeleted_count {
         println!();
-        println!("Warning: refcount does not match number of non-deleted versions");
+        println!(
+            "Warning: refcount does not match number of non-deleted versions (see issue #644)."
+        );
     }
 }