Make the repair command accept subcommands to not do everything all the time
This commit is contained in:
parent
a04218047e
commit
b1ddb933b0
3 changed files with 112 additions and 48 deletions
|
@ -9,3 +9,9 @@ Garage implements an S3-compatible object store with high resiliency to network
|
|||
```
|
||||
RUST_LOG=garage=debug cargo run --release -- server -c config_file.toml
|
||||
```
|
||||
|
||||
## What to repair
|
||||
|
||||
- `tables`: does a full sync of metadata; this should not normally be necessary because the system already does it every hour
|
||||
- `versions` and `block_refs`: very time-consuming, useful if deletions have not been propagated, improves garbage collection
|
||||
- `blocks`: very useful to resync/rebalance blocks between nodes
|
||||
|
|
|
@ -21,7 +21,7 @@ pub const ADMIN_RPC_PATH: &str = "_admin";
|
|||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum AdminRPC {
|
||||
BucketOperation(BucketOperation),
|
||||
LaunchRepair(bool),
|
||||
LaunchRepair(RepairOpt),
|
||||
|
||||
// Replies
|
||||
Ok(String),
|
||||
|
@ -48,9 +48,7 @@ impl AdminRpcHandler {
|
|||
async move {
|
||||
match msg {
|
||||
AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await,
|
||||
AdminRPC::LaunchRepair(repair_all) => {
|
||||
self2.handle_launch_repair(repair_all).await
|
||||
}
|
||||
AdminRPC::LaunchRepair(opt) => self2.handle_launch_repair(opt).await,
|
||||
_ => Err(Error::Message(format!("Invalid RPC"))),
|
||||
}
|
||||
}
|
||||
|
@ -155,14 +153,26 @@ impl AdminRpcHandler {
|
|||
}
|
||||
}
|
||||
|
||||
async fn handle_launch_repair(self: &Arc<Self>, repair_all: bool) -> Result<AdminRPC, Error> {
|
||||
if repair_all {
|
||||
async fn handle_launch_repair(self: &Arc<Self>, opt: RepairOpt) -> Result<AdminRPC, Error> {
|
||||
if !opt.yes {
|
||||
return Err(Error::Message(format!(
|
||||
"Please provide the --yes flag to initiate repair operations."
|
||||
)));
|
||||
}
|
||||
if opt.all_nodes {
|
||||
let mut opt_to_send = opt.clone();
|
||||
opt_to_send.all_nodes = false;
|
||||
|
||||
let mut failures = vec![];
|
||||
let ring = self.garage.system.ring.borrow().clone();
|
||||
for node in ring.config.members.keys() {
|
||||
if self
|
||||
.rpc_client
|
||||
.call(node, AdminRPC::LaunchRepair(false), ADMIN_RPC_TIMEOUT)
|
||||
.call(
|
||||
node,
|
||||
AdminRPC::LaunchRepair(opt_to_send.clone()),
|
||||
ADMIN_RPC_TIMEOUT,
|
||||
)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
|
@ -183,7 +193,7 @@ impl AdminRpcHandler {
|
|||
.system
|
||||
.background
|
||||
.spawn_worker("Repair worker".into(), move |must_exit| async move {
|
||||
self2.repair_worker(must_exit).await
|
||||
self2.repair_worker(opt, must_exit).await
|
||||
})
|
||||
.await;
|
||||
Ok(AdminRPC::Ok(format!(
|
||||
|
@ -193,7 +203,15 @@ impl AdminRpcHandler {
|
|||
}
|
||||
}
|
||||
|
||||
async fn repair_worker(self: Arc<Self>, must_exit: watch::Receiver<bool>) -> Result<(), Error> {
|
||||
async fn repair_worker(
|
||||
self: Arc<Self>,
|
||||
opt: RepairOpt,
|
||||
must_exit: watch::Receiver<bool>,
|
||||
) -> Result<(), Error> {
|
||||
let todo = |x| opt.what.as_ref().map(|y| *y == x).unwrap_or(true);
|
||||
|
||||
if todo(RepairWhat::Tables) {
|
||||
info!("Launching a full sync of tables");
|
||||
self.garage
|
||||
.bucket_table
|
||||
.syncer
|
||||
|
@ -222,16 +240,33 @@ impl AdminRpcHandler {
|
|||
.unwrap()
|
||||
.add_full_scan()
|
||||
.await;
|
||||
}
|
||||
|
||||
// TODO: wait for full sync to finish before proceeding to the rest?
|
||||
|
||||
if todo(RepairWhat::Versions) {
|
||||
info!("Repairing the versions table");
|
||||
self.repair_versions(&must_exit).await?;
|
||||
}
|
||||
|
||||
if todo(RepairWhat::BlockRefs) {
|
||||
info!("Repairing the block refs table");
|
||||
self.repair_block_ref(&must_exit).await?;
|
||||
}
|
||||
|
||||
if opt.what.is_none() {
|
||||
info!("Repairing the RC");
|
||||
self.repair_rc(&must_exit).await?;
|
||||
}
|
||||
|
||||
if todo(RepairWhat::Blocks) {
|
||||
info!("Repairing the stored blocks");
|
||||
self.garage
|
||||
.block_manager
|
||||
.repair_data_store(&must_exit)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
31
src/main.rs
31
src/main.rs
|
@ -188,11 +188,34 @@ pub struct PermBucketOpt {
|
|||
pub bucket: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, StructOpt, Debug)]
|
||||
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
|
||||
pub struct RepairOpt {
|
||||
/// Launch repair operation on all nodes
|
||||
#[structopt(long = "all")]
|
||||
pub all: bool,
|
||||
#[structopt(short = "a", long = "all-nodes")]
|
||||
pub all_nodes: bool,
|
||||
|
||||
/// Confirm the launch of the repair operation
|
||||
#[structopt(long = "yes")]
|
||||
pub yes: bool,
|
||||
|
||||
#[structopt(subcommand)]
|
||||
pub what: Option<RepairWhat>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
|
||||
pub enum RepairWhat {
|
||||
/// Only do a full sync of metadata tables
|
||||
#[structopt(name = "tables")]
|
||||
Tables,
|
||||
/// Only repair (resync/rebalance) the set of stored blocks
|
||||
#[structopt(name = "blocks")]
|
||||
Blocks,
|
||||
/// Only redo the propagation of object deletions to the version table (slow)
|
||||
#[structopt(name = "versions")]
|
||||
Versions,
|
||||
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
|
||||
#[structopt(name = "block_refs")]
|
||||
BlockRefs,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
|
@ -241,7 +264,7 @@ async fn main() {
|
|||
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::BucketOperation(bo)).await
|
||||
}
|
||||
Command::Repair(ro) => {
|
||||
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro.all)).await
|
||||
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro)).await
|
||||
}
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in a new issue