layout cli: safer skip-dead-nodes command

This commit is contained in:
Alex 2023-12-07 11:50:00 +01:00
parent d90de365b3
commit aa59059a91
Signed by: lx
GPG key ID: 0E496D15096376BE
3 changed files with 49 additions and 21 deletions

View file

@ -49,13 +49,7 @@ pub async fn cli_command_dispatch(
}
pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let status = fetch_status(rpc_cli, rpc_host).await?;
let layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== HEALTHY NODES ====");
@ -268,3 +262,18 @@ pub async fn cmd_admin(
}
Ok(())
}
// ---- utility ----
pub async fn fetch_status(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
) -> Result<Vec<KnownNodeInfo>, Error> {
match rpc_cli
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes),
resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
}
}

View file

@ -33,8 +33,8 @@ pub async fn cli_layout_command_dispatch(
cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await
}
LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await,
LayoutOperation::AssumeSync(assume_sync_opt) => {
cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await
LayoutOperation::SkipDeadNodes(assume_sync_opt) => {
cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await
}
}
}
@ -388,13 +388,21 @@ pub async fn cmd_layout_history(
Ok(())
}
pub async fn cmd_layout_assume_sync(
pub async fn cmd_layout_skip_dead_nodes(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
opt: AssumeSyncOpt,
opt: SkipDeadNodesOpt,
) -> Result<(), Error> {
let status = fetch_status(rpc_cli, rpc_host).await?;
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
if layout.versions.len() == 1 {
return Err(Error::Message(
"This command cannot be called when there is only one live cluster layout version"
.into(),
));
}
let min_v = layout.min_stored();
if opt.version <= min_v || opt.version > layout.current().version {
return Err(Error::Message(format!(
@ -408,12 +416,19 @@ pub async fn cmd_layout_assume_sync(
let all_nodes = layout.get_all_nodes();
for node in all_nodes.iter() {
layout.update_trackers.ack_map.set_max(*node, opt.version);
layout.update_trackers.sync_map.set_max(*node, opt.version);
layout
.update_trackers
.sync_ack_map
.set_max(*node, opt.version);
if status.iter().any(|x| x.id == *node && x.is_up) {
continue;
}
if layout.update_trackers.ack_map.set_max(*node, opt.version) {
println!("Increased the ACK tracker for node {:?}", node);
}
if opt.allow_missing_data {
if layout.update_trackers.sync_map.set_max(*node, opt.version) {
println!("Increased the SYNC tracker for node {:?}", node);
}
}
}
send_layout(rpc_cli, rpc_host, layout).await?;

View file

@ -117,9 +117,9 @@ pub enum LayoutOperation {
#[structopt(name = "history", version = garage_version())]
History,
/// Assume all nodes are synchronized up to a certain layout version
#[structopt(name = "assume-sync", version = garage_version())]
AssumeSync(AssumeSyncOpt),
/// Skip dead nodes when waiting for a new layout version to be synchronized
#[structopt(name = "skip-dead-nodes", version = garage_version())]
SkipDeadNodes(SkipDeadNodesOpt),
}
#[derive(StructOpt, Debug)]
@ -178,11 +178,15 @@ pub struct RevertLayoutOpt {
}
#[derive(StructOpt, Debug)]
pub struct AssumeSyncOpt {
pub struct SkipDeadNodesOpt {
/// Version number of the layout to assume is currently up-to-date.
/// This will generally be the current layout version.
#[structopt(long = "version")]
pub(crate) version: u64,
/// Allow the skip even if a quorum of nodes could not be found for
/// the data among the remaining nodes
#[structopt(long = "allow-missing-data")]
pub(crate) allow_missing_data: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]