Report progress of scrub and block repair
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is passing

This commit is contained in:
Alex 2022-06-27 16:52:46 +02:00
parent fc50724256
commit 0e5175abee
Signed by: lx
GPG key ID: 0E496D15096376BE
4 changed files with 133 additions and 53 deletions

View file

@ -3,26 +3,25 @@
all:
clear; cargo build --all-features
doc:
cd doc/book; mdbook build
release:
nix-build --arg release true
shell:
nix-shell
# ----
run1:
RUST_LOG=garage=debug ./target/debug/garage -c tmp/config.1.toml server
RUST_LOG=garage=debug ./target/debug/garage -c tmp/config1.toml server
run1rel:
RUST_LOG=garage=debug ./target/release/garage -c tmp/config.1.toml server
RUST_LOG=garage=debug ./target/release/garage -c tmp/config1.toml server
run2:
RUST_LOG=garage=debug ./target/debug/garage -c tmp/config.2.toml server
RUST_LOG=garage=debug ./target/debug/garage -c tmp/config2.toml server
run2rel:
RUST_LOG=garage=debug ./target/release/garage -c tmp/config.2.toml server
RUST_LOG=garage=debug ./target/release/garage -c tmp/config2.toml server
run3:
RUST_LOG=garage=debug ./target/debug/garage -c tmp/config.3.toml server
RUST_LOG=garage=debug ./target/debug/garage -c tmp/config3.toml server
run3rel:
RUST_LOG=garage=debug ./target/release/garage -c tmp/config.3.toml server
RUST_LOG=garage=debug ./target/release/garage -c tmp/config3.toml server

View file

@ -1,4 +1,5 @@
use core::ops::Bound;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
@ -36,6 +37,25 @@ impl Worker for RepairWorker {
"Block repair worker".into()
}
fn info(&self) -> Option<String> {
match self.block_iter.as_ref() {
None => {
let idx_bytes = self
.next_start
.as_ref()
.map(|x| x.as_slice())
.unwrap_or(&[]);
let idx_bytes = if idx_bytes.len() > 4 {
&idx_bytes[..4]
} else {
idx_bytes
};
Some(format!("Phase 1: {}", hex::encode(idx_bytes)))
}
Some(bi) => Some(format!("Phase 2: {:.2}% done", bi.progress() * 100.)),
}
}
async fn work(
&mut self,
_must_exit: &mut watch::Receiver<bool>,
@ -74,7 +94,7 @@ impl Worker for RepairWorker {
}
if batch_of_hashes.is_empty() {
// move on to phase 2
self.block_iter = Some(BlockStoreIterator::new(&self.manager).await?);
self.block_iter = Some(BlockStoreIterator::new(&self.manager));
return Ok(WorkerStatus::Busy);
}
@ -115,14 +135,14 @@ pub struct ScrubWorker {
}
impl ScrubWorker {
pub async fn new(manager: Arc<BlockManager>, tranquility: u32) -> Result<Self, Error> {
let iterator = BlockStoreIterator::new(&manager).await?;
Ok(Self {
pub fn new(manager: Arc<BlockManager>, tranquility: u32) -> Self {
let iterator = BlockStoreIterator::new(&manager);
Self {
manager,
iterator,
tranquilizer: Tranquilizer::new(30),
tranquility,
})
}
}
}
@ -132,6 +152,10 @@ impl Worker for ScrubWorker {
"Block scrub worker".into()
}
fn info(&self) -> Option<String> {
Some(format!("{:.2}% done", self.iterator.progress() * 100.))
}
async fn work(
&mut self,
_must_exit: &mut watch::Receiver<bool>,
@ -153,51 +177,107 @@ impl Worker for ScrubWorker {
// ----
struct BlockStoreIterator {
path: Vec<fs::ReadDir>,
path: Vec<ReadingDir>,
}
enum ReadingDir {
Pending(PathBuf),
Read {
subpaths: Vec<fs::DirEntry>,
pos: usize,
},
}
impl BlockStoreIterator {
async fn new(manager: &BlockManager) -> Result<Self, Error> {
fn new(manager: &BlockManager) -> Self {
let root_dir = manager.data_dir.clone();
let read_root_dir = fs::read_dir(&root_dir).await?;
Ok(Self {
path: vec![read_root_dir],
})
Self {
path: vec![ReadingDir::Pending(root_dir)],
}
}
/// Returns progress done, between 0% and 1%
fn progress(&self) -> f32 {
if self.path.is_empty() {
1.0
} else {
let mut ret = 0.0;
let mut next_div = 1;
for p in self.path.iter() {
match p {
ReadingDir::Pending(_) => break,
ReadingDir::Read { subpaths, pos } => {
next_div *= subpaths.len();
ret += ((*pos - 1) as f32) / (next_div as f32);
}
}
}
ret
}
}
async fn next(&mut self) -> Result<Option<Hash>, Error> {
loop {
if let Some(reader) = self.path.last_mut() {
if let Some(data_dir_ent) = reader.next_entry().await? {
let name = data_dir_ent.file_name();
let name = if let Ok(n) = name.into_string() {
n
} else {
continue;
};
let ent_type = data_dir_ent.file_type().await?;
let last_path = match self.path.last_mut() {
None => return Ok(None),
Some(lp) => lp,
};
let name = name.strip_suffix(".zst").unwrap_or(&name);
if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() {
let read_child_dir = fs::read_dir(&data_dir_ent.path()).await?;
self.path.push(read_child_dir);
continue;
} else if name.len() == 64 {
let hash_bytes = if let Ok(h) = hex::decode(&name) {
h
} else {
continue;
};
let mut hash = [0u8; 32];
hash.copy_from_slice(&hash_bytes[..]);
return Ok(Some(hash.into()));
}
} else {
if let ReadingDir::Pending(path) = last_path {
let mut reader = fs::read_dir(&path).await?;
let mut subpaths = vec![];
while let Some(ent) = reader.next_entry().await? {
subpaths.push(ent);
}
*last_path = ReadingDir::Read { subpaths, pos: 0 };
}
let (subpaths, pos) = match *last_path {
ReadingDir::Read {
ref subpaths,
ref mut pos,
} => (subpaths, pos),
ReadingDir::Pending(_) => unreachable!(),
};
if *pos >= subpaths.len() {
self.path.pop();
continue;
}
let data_dir_ent = match subpaths.get(*pos) {
None => {
self.path.pop();
continue;
}
Some(ent) => {
*pos += 1;
ent
}
};
let name = data_dir_ent.file_name();
let name = if let Ok(n) = name.into_string() {
n
} else {
return Ok(None);
continue;
};
let ent_type = data_dir_ent.file_type().await?;
let name = name.strip_suffix(".zst").unwrap_or(&name);
if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() {
let path = data_dir_ent.path();
self.path.push(ReadingDir::Pending(path));
continue;
} else if name.len() == 64 {
let hash_bytes = if let Ok(h) = hex::decode(&name) {
h
} else {
continue;
};
let mut hash = [0u8; 32];
hash.copy_from_slice(&hash_bytes[..]);
return Ok(Some(hash.into()));
}
}
}

View file

@ -698,7 +698,7 @@ impl AdminRpcHandler {
)))
}
} else {
launch_online_repair(self.garage.clone(), opt).await?;
launch_online_repair(self.garage.clone(), opt);
Ok(AdminRpc::Ok(format!(
"Repair launched on {:?}",
self.garage.system.id

View file

@ -13,7 +13,7 @@ use garage_util::error::Error;
use crate::*;
pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) -> Result<(), Error> {
pub fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) {
match opt.what {
RepairWhat::Tables => {
info!("Launching a full sync of tables");
@ -45,13 +45,14 @@ pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) -> Result
}
RepairWhat::Scrub { tranquility } => {
info!("Verifying integrity of stored blocks");
garage.background.spawn_worker(
garage_block::repair::ScrubWorker::new(garage.block_manager.clone(), tranquility)
.await?,
);
garage
.background
.spawn_worker(garage_block::repair::ScrubWorker::new(
garage.block_manager.clone(),
tranquility,
));
}
}
Ok(())
}
// ----