Make the repair command accept subcommands to not do everything all the time

This commit is contained in:
Alex 2020-04-21 16:40:17 +00:00
parent a04218047e
commit b1ddb933b0
3 changed files with 112 additions and 48 deletions

View file

@ -9,3 +9,9 @@ Garage implements an S3-compatible object store with high resiliency to network
``` ```
RUST_LOG=garage=debug cargo run --release -- server -c config_file.toml RUST_LOG=garage=debug cargo run --release -- server -c config_file.toml
``` ```
## What to repair
- `tables`: does a full sync of metadata; this should not be necessary because the system already does it every hour
- `versions` and `block_refs`: very time-consuming, useful if deletions have not been propagated, improves garbage collection
- `blocks`: very useful to resync/rebalance blocks between nodes

View file

@ -21,7 +21,7 @@ pub const ADMIN_RPC_PATH: &str = "_admin";
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
pub enum AdminRPC { pub enum AdminRPC {
BucketOperation(BucketOperation), BucketOperation(BucketOperation),
LaunchRepair(bool), LaunchRepair(RepairOpt),
// Replies // Replies
Ok(String), Ok(String),
@ -48,9 +48,7 @@ impl AdminRpcHandler {
async move { async move {
match msg { match msg {
AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await, AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await,
AdminRPC::LaunchRepair(repair_all) => { AdminRPC::LaunchRepair(opt) => self2.handle_launch_repair(opt).await,
self2.handle_launch_repair(repair_all).await
}
_ => Err(Error::Message(format!("Invalid RPC"))), _ => Err(Error::Message(format!("Invalid RPC"))),
} }
} }
@ -155,14 +153,26 @@ impl AdminRpcHandler {
} }
} }
async fn handle_launch_repair(self: &Arc<Self>, repair_all: bool) -> Result<AdminRPC, Error> { async fn handle_launch_repair(self: &Arc<Self>, opt: RepairOpt) -> Result<AdminRPC, Error> {
if repair_all { if !opt.yes {
return Err(Error::Message(format!(
"Please provide the --yes flag to initiate repair operations."
)));
}
if opt.all_nodes {
let mut opt_to_send = opt.clone();
opt_to_send.all_nodes = false;
let mut failures = vec![]; let mut failures = vec![];
let ring = self.garage.system.ring.borrow().clone(); let ring = self.garage.system.ring.borrow().clone();
for node in ring.config.members.keys() { for node in ring.config.members.keys() {
if self if self
.rpc_client .rpc_client
.call(node, AdminRPC::LaunchRepair(false), ADMIN_RPC_TIMEOUT) .call(
node,
AdminRPC::LaunchRepair(opt_to_send.clone()),
ADMIN_RPC_TIMEOUT,
)
.await .await
.is_err() .is_err()
{ {
@ -183,7 +193,7 @@ impl AdminRpcHandler {
.system .system
.background .background
.spawn_worker("Repair worker".into(), move |must_exit| async move { .spawn_worker("Repair worker".into(), move |must_exit| async move {
self2.repair_worker(must_exit).await self2.repair_worker(opt, must_exit).await
}) })
.await; .await;
Ok(AdminRPC::Ok(format!( Ok(AdminRPC::Ok(format!(
@ -193,7 +203,15 @@ impl AdminRpcHandler {
} }
} }
async fn repair_worker(self: Arc<Self>, must_exit: watch::Receiver<bool>) -> Result<(), Error> { async fn repair_worker(
self: Arc<Self>,
opt: RepairOpt,
must_exit: watch::Receiver<bool>,
) -> Result<(), Error> {
let todo = |x| opt.what.as_ref().map(|y| *y == x).unwrap_or(true);
if todo(RepairWhat::Tables) {
info!("Launching a full sync of tables");
self.garage self.garage
.bucket_table .bucket_table
.syncer .syncer
@ -222,16 +240,33 @@ impl AdminRpcHandler {
.unwrap() .unwrap()
.add_full_scan() .add_full_scan()
.await; .await;
}
// TODO: wait for full sync to finish before proceeding to the rest? // TODO: wait for full sync to finish before proceeding to the rest?
if todo(RepairWhat::Versions) {
info!("Repairing the versions table");
self.repair_versions(&must_exit).await?; self.repair_versions(&must_exit).await?;
}
if todo(RepairWhat::BlockRefs) {
info!("Repairing the block refs table");
self.repair_block_ref(&must_exit).await?; self.repair_block_ref(&must_exit).await?;
}
if opt.what.is_none() {
info!("Repairing the RC");
self.repair_rc(&must_exit).await?; self.repair_rc(&must_exit).await?;
}
if todo(RepairWhat::Blocks) {
info!("Repairing the stored blocks");
self.garage self.garage
.block_manager .block_manager
.repair_data_store(&must_exit) .repair_data_store(&must_exit)
.await?; .await?;
}
Ok(()) Ok(())
} }

View file

@ -188,11 +188,34 @@ pub struct PermBucketOpt {
pub bucket: String, pub bucket: String,
} }
#[derive(Serialize, Deserialize, StructOpt, Debug)] #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct RepairOpt { pub struct RepairOpt {
/// Launch repair operation on all nodes /// Launch repair operation on all nodes
#[structopt(long = "all")] #[structopt(short = "a", long = "all-nodes")]
pub all: bool, pub all_nodes: bool,
/// Confirm the launch of the repair operation
#[structopt(long = "yes")]
pub yes: bool,
#[structopt(subcommand)]
pub what: Option<RepairWhat>,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum RepairWhat {
/// Only do a full sync of metadata tables
#[structopt(name = "tables")]
Tables,
/// Only repair (resync/rebalance) the set of stored blocks
#[structopt(name = "blocks")]
Blocks,
/// Only redo the propagation of object deletions to the version table (slow)
#[structopt(name = "versions")]
Versions,
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
#[structopt(name = "block_refs")]
BlockRefs,
} }
#[tokio::main] #[tokio::main]
@ -241,7 +264,7 @@ async fn main() {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::BucketOperation(bo)).await cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::BucketOperation(bo)).await
} }
Command::Repair(ro) => { Command::Repair(ro) => {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro.all)).await cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro)).await
} }
}; };