Make the repair command accept subcommands to not do everything all the time
This commit is contained in:
parent
a04218047e
commit
b1ddb933b0
3 changed files with 112 additions and 48 deletions
|
@ -9,3 +9,9 @@ Garage implements an S3-compatible object store with high resiliency to network
|
||||||
```
|
```
|
||||||
RUST_LOG=garage=debug cargo run --release -- server -c config_file.toml
|
RUST_LOG=garage=debug cargo run --release -- server -c config_file.toml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## What to repair
|
||||||
|
|
||||||
|
- `tables`: does a full sync of metadata; this should not normally be necessary because the system already does it every hour
|
||||||
|
- `versions` and `block_refs`: very time-consuming, useful if deletions have not been propagated, improves garbage collection
|
||||||
|
- `blocks`: very useful to resync/rebalance blocks between nodes
|
||||||
|
|
|
@ -21,7 +21,7 @@ pub const ADMIN_RPC_PATH: &str = "_admin";
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
pub enum AdminRPC {
|
pub enum AdminRPC {
|
||||||
BucketOperation(BucketOperation),
|
BucketOperation(BucketOperation),
|
||||||
LaunchRepair(bool),
|
LaunchRepair(RepairOpt),
|
||||||
|
|
||||||
// Replies
|
// Replies
|
||||||
Ok(String),
|
Ok(String),
|
||||||
|
@ -48,9 +48,7 @@ impl AdminRpcHandler {
|
||||||
async move {
|
async move {
|
||||||
match msg {
|
match msg {
|
||||||
AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await,
|
AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await,
|
||||||
AdminRPC::LaunchRepair(repair_all) => {
|
AdminRPC::LaunchRepair(opt) => self2.handle_launch_repair(opt).await,
|
||||||
self2.handle_launch_repair(repair_all).await
|
|
||||||
}
|
|
||||||
_ => Err(Error::Message(format!("Invalid RPC"))),
|
_ => Err(Error::Message(format!("Invalid RPC"))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -155,14 +153,26 @@ impl AdminRpcHandler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_launch_repair(self: &Arc<Self>, repair_all: bool) -> Result<AdminRPC, Error> {
|
async fn handle_launch_repair(self: &Arc<Self>, opt: RepairOpt) -> Result<AdminRPC, Error> {
|
||||||
if repair_all {
|
if !opt.yes {
|
||||||
|
return Err(Error::Message(format!(
|
||||||
|
"Please provide the --yes flag to initiate repair operations."
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
if opt.all_nodes {
|
||||||
|
let mut opt_to_send = opt.clone();
|
||||||
|
opt_to_send.all_nodes = false;
|
||||||
|
|
||||||
let mut failures = vec![];
|
let mut failures = vec![];
|
||||||
let ring = self.garage.system.ring.borrow().clone();
|
let ring = self.garage.system.ring.borrow().clone();
|
||||||
for node in ring.config.members.keys() {
|
for node in ring.config.members.keys() {
|
||||||
if self
|
if self
|
||||||
.rpc_client
|
.rpc_client
|
||||||
.call(node, AdminRPC::LaunchRepair(false), ADMIN_RPC_TIMEOUT)
|
.call(
|
||||||
|
node,
|
||||||
|
AdminRPC::LaunchRepair(opt_to_send.clone()),
|
||||||
|
ADMIN_RPC_TIMEOUT,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
.is_err()
|
.is_err()
|
||||||
{
|
{
|
||||||
|
@ -183,7 +193,7 @@ impl AdminRpcHandler {
|
||||||
.system
|
.system
|
||||||
.background
|
.background
|
||||||
.spawn_worker("Repair worker".into(), move |must_exit| async move {
|
.spawn_worker("Repair worker".into(), move |must_exit| async move {
|
||||||
self2.repair_worker(must_exit).await
|
self2.repair_worker(opt, must_exit).await
|
||||||
})
|
})
|
||||||
.await;
|
.await;
|
||||||
Ok(AdminRPC::Ok(format!(
|
Ok(AdminRPC::Ok(format!(
|
||||||
|
@ -193,7 +203,15 @@ impl AdminRpcHandler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn repair_worker(self: Arc<Self>, must_exit: watch::Receiver<bool>) -> Result<(), Error> {
|
async fn repair_worker(
|
||||||
|
self: Arc<Self>,
|
||||||
|
opt: RepairOpt,
|
||||||
|
must_exit: watch::Receiver<bool>,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let todo = |x| opt.what.as_ref().map(|y| *y == x).unwrap_or(true);
|
||||||
|
|
||||||
|
if todo(RepairWhat::Tables) {
|
||||||
|
info!("Launching a full sync of tables");
|
||||||
self.garage
|
self.garage
|
||||||
.bucket_table
|
.bucket_table
|
||||||
.syncer
|
.syncer
|
||||||
|
@ -222,16 +240,33 @@ impl AdminRpcHandler {
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.add_full_scan()
|
.add_full_scan()
|
||||||
.await;
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: wait for full sync to finish before proceeding to the rest?
|
// TODO: wait for full sync to finish before proceeding to the rest?
|
||||||
|
|
||||||
|
if todo(RepairWhat::Versions) {
|
||||||
|
info!("Repairing the versions table");
|
||||||
self.repair_versions(&must_exit).await?;
|
self.repair_versions(&must_exit).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if todo(RepairWhat::BlockRefs) {
|
||||||
|
info!("Repairing the block refs table");
|
||||||
self.repair_block_ref(&must_exit).await?;
|
self.repair_block_ref(&must_exit).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if opt.what.is_none() {
|
||||||
|
info!("Repairing the RC");
|
||||||
self.repair_rc(&must_exit).await?;
|
self.repair_rc(&must_exit).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if todo(RepairWhat::Blocks) {
|
||||||
|
info!("Repairing the stored blocks");
|
||||||
self.garage
|
self.garage
|
||||||
.block_manager
|
.block_manager
|
||||||
.repair_data_store(&must_exit)
|
.repair_data_store(&must_exit)
|
||||||
.await?;
|
.await?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
31
src/main.rs
31
src/main.rs
|
@ -188,11 +188,34 @@ pub struct PermBucketOpt {
|
||||||
pub bucket: String,
|
pub bucket: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, StructOpt, Debug)]
|
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
|
||||||
pub struct RepairOpt {
|
pub struct RepairOpt {
|
||||||
/// Launch repair operation on all nodes
|
/// Launch repair operation on all nodes
|
||||||
#[structopt(long = "all")]
|
#[structopt(short = "a", long = "all-nodes")]
|
||||||
pub all: bool,
|
pub all_nodes: bool,
|
||||||
|
|
||||||
|
/// Confirm the launch of the repair operation
|
||||||
|
#[structopt(long = "yes")]
|
||||||
|
pub yes: bool,
|
||||||
|
|
||||||
|
#[structopt(subcommand)]
|
||||||
|
pub what: Option<RepairWhat>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
|
||||||
|
pub enum RepairWhat {
|
||||||
|
/// Only do a full sync of metadata tables
|
||||||
|
#[structopt(name = "tables")]
|
||||||
|
Tables,
|
||||||
|
/// Only repair (resync/rebalance) the set of stored blocks
|
||||||
|
#[structopt(name = "blocks")]
|
||||||
|
Blocks,
|
||||||
|
/// Only redo the propagation of object deletions to the version table (slow)
|
||||||
|
#[structopt(name = "versions")]
|
||||||
|
Versions,
|
||||||
|
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
|
||||||
|
#[structopt(name = "block_refs")]
|
||||||
|
BlockRefs,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
|
@ -241,7 +264,7 @@ async fn main() {
|
||||||
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::BucketOperation(bo)).await
|
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::BucketOperation(bo)).await
|
||||||
}
|
}
|
||||||
Command::Repair(ro) => {
|
Command::Repair(ro) => {
|
||||||
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro.all)).await
|
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro)).await
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue