garage/src/block/repair.rs

764 lines
20 KiB
Rust

use core::ops::Bound;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use rand::Rng;
use tokio::fs;
use tokio::select;
use tokio::sync::mpsc;
use tokio::sync::watch;
use garage_util::background::*;
use garage_util::data::*;
use garage_util::error::*;
use garage_util::persister::PersisterShared;
use garage_util::time::*;
use garage_util::tranquilizer::Tranquilizer;
use crate::block::*;
use crate::manager::*;
// Base delay between two full scrubs: 25 days. A random extra delay of up to
// 10 days is added on top of it in randomize_next_scrub_run_time() below.
const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 25);
// Scrub tranquility is initially set to 4, but can be changed in the CLI
// and the updated version is persisted over Garage restarts
const INITIAL_SCRUB_TRANQUILITY: u32 = 4;
// ---- ---- ----
// FIRST KIND OF REPAIR: FINDING MISSING BLOCKS/USELESS BLOCKS
// This is a one-shot repair operation that can be launched,
// checks everything, and then exits.
// ---- ---- ----
/// State of the one-shot block repair worker.
///
/// The worker runs in two phases: while `block_iter` is `None` it walks the
/// RC table (phase 1); once `block_iter` is set it walks the blocks found on
/// disk (phase 2).
pub struct RepairWorker {
// Block manager whose RC table, resync queue and data layout are used.
manager: Arc<BlockManager>,
// Phase 1 cursor: last hash of the RC table that was put in the resync
// queue; `None` until a first hash has been queued.
next_start: Option<Hash>,
// Phase 2 iterator over the block store; `None` while still in phase 1.
block_iter: Option<BlockStoreIterator>,
}
impl RepairWorker {
    /// Creates a repair worker positioned at the very beginning of phase 1:
    /// no RC table cursor yet, and no block store iterator.
    pub fn new(manager: Arc<BlockManager>) -> Self {
        let (next_start, block_iter) = (None, None);
        Self {
            manager,
            next_start,
            block_iter,
        }
    }
}
#[async_trait]
impl Worker for RepairWorker {
    /// Display name of this worker.
    fn name(&self) -> String {
        String::from("Block repair worker")
    }

    /// Reports which phase the repair is in.
    ///
    /// Phase 1 always reports `0.00%` together with the first bytes (at most 4,
    /// hex-encoded) of the current RC table cursor. Phase 2 reports the
    /// progress of the block store iterator.
    fn status(&self) -> WorkerStatus {
        if let Some(bi) = self.block_iter.as_ref() {
            return WorkerStatus {
                progress: Some(format!("{:.2}%", bi.progress() * 100.)),
                freeform: vec!["Currently in phase 2".into()],
                ..Default::default()
            };
        }

        let position = self
            .next_start
            .as_ref()
            .map(|x| x.as_slice())
            .unwrap_or(&[]);
        // Only show a short prefix of the cursor hash.
        let prefix = &position[..position.len().min(4)];
        WorkerStatus {
            progress: Some("0.00%".into()),
            freeform: vec![format!(
                "Currently in phase 1, iterator position: {}",
                hex::encode(prefix)
            )],
            ..Default::default()
        }
    }

    /// Performs one step of the repair.
    ///
    /// Returns `WorkerState::Busy` as long as there is more to do and
    /// `WorkerState::Done` once the block store iterator of phase 2 is
    /// exhausted. Errors from the RC table, the resync queue or the block
    /// store iterator are propagated.
    async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
        // Everything queued by this worker is to be resynced without delay.
        let no_delay = Duration::from_secs(0);

        if let Some(bi) = self.block_iter.as_mut() {
            // Phase 2: Repair blocks actually on disk
            // Every block found on disk is added to the resync queue. This lets
            // us find blocks we are storing but don't actually need, so that
            // they can be offloaded if necessary and then deleted locally.
            return match bi.next().await? {
                Some((_path, hash)) => {
                    self.manager.resync.put_to_resync(&hash, no_delay)?;
                    Ok(WorkerState::Busy)
                }
                None => Ok(WorkerState::Done),
            };
        }

        // Phase 1: Repair blocks from RC table.
        // This is done in two steps: first read a batch of hashes from the RC
        // table, then (once the table iterator is gone) insert them in the
        // resync queue. The reason is SQLite: as long as an iterator is open on
        // a DB table, nothing else can be done on the DB, so inserting in the
        // resync queue while iterating would deadlock in the SQLite adapter
        // code. The Rust bindings for SQLite assume the worst case where SQLite
        // is not compiled in thread-safe mode, hence everything is wrapped in a
        // mutex (see db/sqlite_adapter.rs and discussion in PR #322).
        // TODO: maybe do this with tokio::task::spawn_blocking ?
        let lower_bound = match self.next_start.as_ref() {
            Some(last) => Bound::Excluded(last.as_slice()),
            None => Bound::Unbounded,
        };
        let mut batch_of_hashes = vec![];
        for entry in self
            .manager
            .rc
            .rc_table
            .range::<&[u8], _>((lower_bound, Bound::Unbounded))?
            .into_iter()
            .take(1000)
        {
            let (key, _) = entry?;
            batch_of_hashes.push(Hash::try_from(&key[..]).unwrap());
        }

        if batch_of_hashes.is_empty() {
            // Nothing left after the cursor: move on to phase 2
            self.block_iter = Some(BlockStoreIterator::new(&self.manager));
            return Ok(WorkerState::Busy);
        }

        for hash in batch_of_hashes {
            self.manager.resync.put_to_resync(&hash, no_delay)?;
            // Advance the cursor after each successful insertion, so that a
            // failure in the middle of a batch resumes right after the last
            // queued hash.
            self.next_start = Some(hash);
        }
        Ok(WorkerState::Busy)
    }

    /// Not expected to be called: `work` above only ever returns `Busy` or
    /// `Done`, never an idle state.
    async fn wait_for_work(&mut self) -> WorkerState {
        unreachable!()
    }
}
// ---- ---- ----
// SECOND KIND OF REPAIR: SCRUBBING THE DATASTORE
// This is significantly more complex than the process above,
// as it is a continuously-running task that triggers automatically
// every SCRUB_INTERVAL, but can also be triggered manually
// and whose parameter (esp. speed) can be controlled at runtime.
// ---- ---- ----
// Initial (oldest) persisted format of the scrub worker state.
// Kept only so that it can be migrated to the v082 format below.
mod v081 {
use serde::{Deserialize, Serialize};
// NOTE(review): this struct is a persisted format; fields should presumably
// not be reordered or retyped, as that may change the serialized layout.
#[derive(Serialize, Deserialize)]
pub struct ScrubWorkerPersisted {
// Tranquility setting of the scrub worker
pub tranquility: u32,
// Timestamp of the last complete scrub (copied as-is by the v082 migration)
pub(crate) time_last_complete_scrub: u64,
// Counter of corruptions detected by the scrub
pub(crate) corruptions_detected: u64,
}
impl garage_util::migrate::InitialFormat for ScrubWorkerPersisted {}
}
// Current persisted format of the scrub worker state. Compared to v081 it adds
// the time of the next scheduled scrub and an optional checkpoint of the block
// store iterator.
mod v082 {
use garage_util::data::Hash;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use super::v081;
// NOTE(review): persisted format; fields should presumably not be reordered
// or retyped, as that may change the serialized layout.
#[derive(Serialize, Deserialize)]
pub struct ScrubWorkerPersisted {
// Tranquility setting of the scrub worker
pub tranquility: u32,
// Timestamp (from now_msec()) at which the last scrub completed
pub(crate) time_last_complete_scrub: u64,
// Timestamp at which the next scrub is scheduled to start automatically
pub(crate) time_next_run_scrub: u64,
// Number of corrupt blocks found by the scrub so far
pub(crate) corruptions_detected: u64,
// Saved position of a scrub in progress, None if no scrub is in progress.
// Defaults to None when absent from the deserialized data.
#[serde(default)]
pub(crate) checkpoint: Option<BlockStoreIterator>,
}
// Iterator over the block store, represented as a stack of things left to
// visit: the next item to process is the LAST element of `todo`.
#[derive(Serialize, Deserialize, Clone)]
pub struct BlockStoreIterator {
pub todo: Vec<BsiTodo>,
}
// One item left to visit in a BlockStoreIterator. Progress values are
// fixed-point integers (see PROGRESS_FP in the parent module).
#[derive(Serialize, Deserialize, Clone)]
pub enum BsiTodo {
// A directory that has not been listed yet, with the progress range that
// is to be distributed among its entries
Directory {
path: PathBuf,
progress_min: u64,
progress_max: u64,
},
// A block file, with the progress value reached when it is next in line
File {
path: PathBuf,
hash: Hash,
progress: u64,
},
}
impl garage_util::migrate::Migrate for ScrubWorkerPersisted {
type Previous = v081::ScrubWorkerPersisted;
const VERSION_MARKER: &'static [u8] = b"G082bswp";
// Migration from v081: all existing fields are copied, the next scrub is
// scheduled relative to the last complete scrub (with random jitter), and
// there is no checkpoint.
fn migrate(old: v081::ScrubWorkerPersisted) -> ScrubWorkerPersisted {
use crate::repair::randomize_next_scrub_run_time;
ScrubWorkerPersisted {
tranquility: old.tranquility,
time_last_complete_scrub: old.time_last_complete_scrub,
time_next_run_scrub: randomize_next_scrub_run_time(old.time_last_complete_scrub),
corruptions_detected: old.corruptions_detected,
checkpoint: None,
}
}
}
}
pub use v082::*;
/// Continuously-running worker that scrubs the datastore: it reads back every
/// block found by a `BlockStoreIterator` and counts the ones reported as
/// corrupt.
pub struct ScrubWorker {
// Block manager used to enumerate and read blocks
manager: Arc<BlockManager>,
// Channel on which control commands (start/pause/resume/cancel) arrive
rx_cmd: mpsc::Receiver<ScrubWorkerCommand>,
// Current state of the scrub (running, paused or finished)
work: ScrubWorkerState,
// Used together with the persisted tranquility value to decide the worker
// state returned after each scrubbed block
tranquilizer: Tranquilizer,
// Persisted parameters, statistics and checkpoint of the scrub
persister: PersisterShared<ScrubWorkerPersisted>,
}
/// Computes when the next scrub should run, given a reference `timestamp`
/// in milliseconds.
///
/// The result is `timestamp` plus `SCRUB_INTERVAL` plus a random whole number
/// of seconds below 10 days, expressed in milliseconds. The random part is an
/// attempt to balance scrub load across different cluster nodes.
fn randomize_next_scrub_run_time(timestamp: u64) -> u64 {
    let jitter = Duration::from_secs(rand::thread_rng().gen_range(0..3600 * 24 * 10));
    let delay_msec = SCRUB_INTERVAL.saturating_add(jitter).as_millis() as u64;
    timestamp + delay_msec
}
impl Default for ScrubWorkerPersisted {
    /// Initial persisted state: no scrub completed yet, no corruption counted,
    /// no checkpoint, the initial tranquility, and a first scrub scheduled
    /// relative to the current time.
    fn default() -> Self {
        let first_scheduled_run = randomize_next_scrub_run_time(now_msec());
        Self {
            tranquility: INITIAL_SCRUB_TRANQUILITY,
            time_last_complete_scrub: 0,
            time_next_run_scrub: first_scheduled_run,
            corruptions_detected: 0,
            checkpoint: None,
        }
    }
}
/// In-memory state of the scrub worker. The default state is `Finished`.
#[derive(Default)]
enum ScrubWorkerState {
// A scrub is in progress
Running {
// position of the scrub in the block store
iterator: BlockStoreIterator,
// time of the last checkpoint
t_cp: u64,
},
// A scrub is in progress but temporarily suspended
Paused {
// position at which the scrub will continue
iterator: BlockStoreIterator,
// time at which the scrub should be resumed
t_resume: u64,
},
// No scrub in progress
#[default]
Finished,
}
/// Commands that can be sent to the scrub worker through its command channel.
#[derive(Debug)]
pub enum ScrubWorkerCommand {
// Start a new scrub (refused with an error log if one is running or paused)
Start,
// Pause the scrub in progress for the given duration
Pause(Duration),
// Resume a paused scrub immediately
Resume,
// Abort the scrub in progress and drop its checkpoint
Cancel,
}
impl ScrubWorker {
pub(crate) fn new(
manager: Arc<BlockManager>,
rx_cmd: mpsc::Receiver<ScrubWorkerCommand>,
persister: PersisterShared<ScrubWorkerPersisted>,
) -> Self {
let work = match persister.get_with(|x| x.checkpoint.clone()) {
None => ScrubWorkerState::Finished,
Some(iterator) => ScrubWorkerState::Running {
iterator,
t_cp: now_msec(),
},
};
Self {
manager,
rx_cmd,
work,
tranquilizer: Tranquilizer::new(30),
persister,
}
}
async fn handle_cmd(&mut self, cmd: ScrubWorkerCommand) {
match cmd {
ScrubWorkerCommand::Start => {
self.work = match std::mem::take(&mut self.work) {
ScrubWorkerState::Finished => {
info!("Scrub worker initializing, now performing datastore scrub");
let iterator = BlockStoreIterator::new(&self.manager);
if let Err(e) = self
.persister
.set_with(|x| x.checkpoint = Some(iterator.clone()))
{
error!("Could not save scrub checkpoint: {}", e);
}
ScrubWorkerState::Running {
iterator,
t_cp: now_msec(),
}
}
work => {
error!("Cannot start scrub worker: already running!");
work
}
};
}
ScrubWorkerCommand::Pause(dur) => {
self.work = match std::mem::take(&mut self.work) {
ScrubWorkerState::Running { iterator, .. }
| ScrubWorkerState::Paused { iterator, .. } => {
if let Err(e) = self
.persister
.set_with(|x| x.checkpoint = Some(iterator.clone()))
{
error!("Could not save scrub checkpoint: {}", e);
}
ScrubWorkerState::Paused {
iterator,
t_resume: now_msec() + dur.as_millis() as u64,
}
}
work => {
error!("Cannot pause scrub worker: not running!");
work
}
};
}
ScrubWorkerCommand::Resume => {
self.work = match std::mem::take(&mut self.work) {
ScrubWorkerState::Paused { iterator, .. } => ScrubWorkerState::Running {
iterator,
t_cp: now_msec(),
},
work => {
error!("Cannot resume scrub worker: not paused!");
work
}
};
}
ScrubWorkerCommand::Cancel => {
self.work = match std::mem::take(&mut self.work) {
ScrubWorkerState::Running { .. } | ScrubWorkerState::Paused { .. } => {
if let Err(e) = self.persister.set_with(|x| x.checkpoint = None) {
error!("Could not save scrub checkpoint: {}", e);
}
ScrubWorkerState::Finished
}
work => {
error!("Cannot cancel scrub worker: not running!");
work
}
}
}
}
}
}
#[async_trait]
impl Worker for ScrubWorker {
fn name(&self) -> String {
"Block scrub worker".into()
}
fn status(&self) -> WorkerStatus {
let (corruptions_detected, tranquility, time_last_complete_scrub, time_next_run_scrub) =
self.persister.get_with(|p| {
(
p.corruptions_detected,
p.tranquility,
p.time_last_complete_scrub,
p.time_next_run_scrub,
)
});
let mut s = WorkerStatus {
persistent_errors: Some(corruptions_detected),
tranquility: Some(tranquility),
..Default::default()
};
match &self.work {
ScrubWorkerState::Running { iterator, .. } => {
s.progress = Some(format!("{:.2}%", iterator.progress() * 100.));
}
ScrubWorkerState::Paused { iterator, t_resume } => {
s.progress = Some(format!("{:.2}%", iterator.progress() * 100.));
s.freeform = vec![format!(
"Scrub paused, resumes at {}",
msec_to_rfc3339(*t_resume)
)];
}
ScrubWorkerState::Finished => {
s.freeform = vec![
format!(
"Last scrub completed at {}",
msec_to_rfc3339(time_last_complete_scrub),
),
format!(
"Next scrub scheduled for {}",
msec_to_rfc3339(time_next_run_scrub)
),
];
}
}
s
}
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
match self.rx_cmd.try_recv() {
Ok(cmd) => self.handle_cmd(cmd).await,
Err(mpsc::error::TryRecvError::Disconnected) => return Ok(WorkerState::Done),
Err(mpsc::error::TryRecvError::Empty) => (),
};
match &mut self.work {
ScrubWorkerState::Running { iterator, t_cp } => {
self.tranquilizer.reset();
let now = now_msec();
if let Some((_path, hash)) = iterator.next().await? {
match self.manager.read_block(&hash).await {
Err(Error::CorruptData(_)) => {
error!("Found corrupt data block during scrub: {:?}", hash);
self.persister.set_with(|p| p.corruptions_detected += 1)?;
}
Err(e) => return Err(e),
_ => (),
};
if now - *t_cp > 60 * 1000 {
self.persister
.set_with(|p| p.checkpoint = Some(iterator.clone()))?;
*t_cp = now;
}
Ok(self
.tranquilizer
.tranquilize_worker(self.persister.get_with(|p| p.tranquility)))
} else {
let next_scrub_timestamp = randomize_next_scrub_run_time(now);
self.persister.set_with(|p| {
p.time_last_complete_scrub = now;
p.time_next_run_scrub = next_scrub_timestamp;
p.checkpoint = None;
})?;
self.work = ScrubWorkerState::Finished;
self.tranquilizer.clear();
info!(
"Datastore scrub completed, next scrub scheduled for {}",
msec_to_rfc3339(next_scrub_timestamp)
);
Ok(WorkerState::Idle)
}
}
_ => Ok(WorkerState::Idle),
}
}
async fn wait_for_work(&mut self) -> WorkerState {
let (wait_until, command) = match &self.work {
ScrubWorkerState::Running { .. } => return WorkerState::Busy,
ScrubWorkerState::Paused { t_resume, .. } => (*t_resume, ScrubWorkerCommand::Resume),
ScrubWorkerState::Finished => (
self.persister.get_with(|p| p.time_next_run_scrub),
ScrubWorkerCommand::Start,
),
};
let now = now_msec();
if now >= wait_until {
self.handle_cmd(command).await;
return WorkerState::Busy;
}
let delay = Duration::from_millis(wait_until - now);
select! {
_ = tokio::time::sleep(delay) => self.handle_cmd(command).await,
cmd = self.rx_cmd.recv() => if let Some(cmd) = cmd {
self.handle_cmd(cmd).await;
} else {
return WorkerState::Done;
}
}
match &self.work {
ScrubWorkerState::Running { .. } => WorkerState::Busy,
_ => WorkerState::Idle,
}
}
}
// ---- ---- ----
// THIRD KIND OF REPAIR: REBALANCING DATA BLOCKS
// between multiple storage locations.
// This is a one-shot repair operation that can be launched,
// checks everything, and then exits.
// ---- ---- ----
/// State of the one-shot worker that moves data blocks to their primary
/// storage location.
pub struct RebalanceWorker {
// Block manager used to look up and fix block locations
manager: Arc<BlockManager>,
// Iterator over all blocks found on disk
block_iter: BlockStoreIterator,
// Timestamp (from now_msec()) at which the worker was created
t_started: u64,
// Timestamp at which the rebalance finished, None while in progress
t_finished: Option<u64>,
// Number of blocks moved so far
moved: usize,
// Sum of the lengths reported for the blocks moved so far
moved_bytes: u64,
}
impl RebalanceWorker {
    /// Creates a rebalance worker with a fresh iterator over the block store,
    /// zeroed counters, and the current time as start time.
    pub fn new(manager: Arc<BlockManager>) -> Self {
        let block_iter = BlockStoreIterator::new(&manager);
        let t_started = now_msec();
        Self {
            manager,
            block_iter,
            t_started,
            t_finished: None,
            moved: 0,
            moved_bytes: 0,
        }
    }
}
#[async_trait]
impl Worker for RebalanceWorker {
fn name(&self) -> String {
"Block rebalance worker".into()
}
fn status(&self) -> WorkerStatus {
let t_cur = self.t_finished.unwrap_or_else(|| now_msec());
let rate = self.moved_bytes / std::cmp::max(1, (t_cur - self.t_started) / 1000);
let mut freeform = vec![
format!("Blocks moved: {}", self.moved),
format!(
"Bytes moved: {} ({}/s)",
bytesize::ByteSize::b(self.moved_bytes),
bytesize::ByteSize::b(rate)
),
format!("Started: {}", msec_to_rfc3339(self.t_started)),
];
if let Some(t_fin) = self.t_finished {
freeform.push(format!("Finished: {}", msec_to_rfc3339(t_fin)))
}
WorkerStatus {
progress: Some(format!("{:.2}%", self.block_iter.progress() * 100.)),
freeform,
..Default::default()
}
}
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
if let Some((path, hash)) = self.block_iter.next().await? {
let prim_loc = self.manager.data_layout.load().primary_block_dir(&hash);
if path.ancestors().all(|x| x != prim_loc) {
let block_path = match path.extension() {
None => DataBlockPath::plain(path.clone()),
Some(x) if x.to_str() == Some("zst") => DataBlockPath::compressed(path.clone()),
_ => {
warn!("not rebalancing file: {}", path.to_string_lossy());
return Ok(WorkerState::Busy);
}
};
// block is not in its primary location,
// move it there (reading and re-writing does the trick)
debug!("rebalance: moving block {:?} => {:?}", block_path, prim_loc);
let block_len = self.manager.fix_block_location(&hash, block_path).await?;
self.moved += 1;
self.moved_bytes += block_len as u64;
}
Ok(WorkerState::Busy)
} else {
// all blocks are in their primary location:
// - the ones we moved now are
// - the ones written in the meantime always were, because we only
// write to primary locations
// so we can safely remove all secondary locations from the data layout
let new_layout = self
.manager
.data_layout
.load_full()
.without_secondary_locations();
self.manager
.data_layout_persister
.save_async(&new_layout)
.await?;
self.manager.data_layout.store(Arc::new(new_layout));
self.t_finished = Some(now_msec());
Ok(WorkerState::Done)
}
}
async fn wait_for_work(&mut self) -> WorkerState {
unreachable!()
}
}
// ---- ---- ----
// UTILITY FOR ENUMERATING THE BLOCK STORE
// ---- ---- ----
// Fixed-point scale of progress values: progress is stored as an integer
// which, divided by PROGRESS_FP, gives the fraction of work done.
const PROGRESS_FP: u64 = 1_000_000_000;
impl BlockStoreIterator {
fn new(manager: &BlockManager) -> Self {
let data_layout = manager.data_layout.load_full();
let mut dir_cap = vec![0; data_layout.data_dirs.len()];
for prim in data_layout.part_prim.iter() {
dir_cap[*prim as usize] += 1;
}
for sec_vec in data_layout.part_sec.iter() {
for sec in sec_vec.iter() {
dir_cap[*sec as usize] += 1;
}
}
let sum_cap = dir_cap.iter().sum::<usize>() as u64;
let mut cum_cap = 0;
let mut todo = vec![];
for (dir, cap) in data_layout.data_dirs.iter().zip(dir_cap.into_iter()) {
let progress_min = (cum_cap * PROGRESS_FP) / sum_cap;
let progress_max = ((cum_cap + cap as u64) * PROGRESS_FP) / sum_cap;
cum_cap += cap as u64;
todo.push(BsiTodo::Directory {
path: dir.path.clone(),
progress_min,
progress_max,
});
}
// entries are processed back-to-front (because of .pop()),
// so reverse entries to process them in increasing progress bounds
todo.reverse();
let ret = Self { todo };
debug_assert!(ret.progress_invariant());
ret
}
/// Returns progress done, between 0 and 1
fn progress(&self) -> f32 {
self.todo
.last()
.map(|x| match x {
BsiTodo::Directory { progress_min, .. } => *progress_min,
BsiTodo::File { progress, .. } => *progress,
})
.map(|x| x as f32 / PROGRESS_FP as f32)
.unwrap_or(1.0)
}
async fn next(&mut self) -> Result<Option<(PathBuf, Hash)>, Error> {
loop {
match self.todo.pop() {
None => return Ok(None),
Some(BsiTodo::Directory {
path,
progress_min,
progress_max,
}) => {
let istart = self.todo.len();
let mut reader = fs::read_dir(&path).await?;
while let Some(ent) = reader.next_entry().await? {
let name = if let Ok(n) = ent.file_name().into_string() {
n
} else {
continue;
};
let ft = ent.file_type().await?;
if ft.is_dir() && hex::decode(&name).is_ok() {
self.todo.push(BsiTodo::Directory {
path: ent.path(),
progress_min: 0,
progress_max: 0,
});
} else if ft.is_file() {
let filename = name.split_once('.').map(|(f, _)| f).unwrap_or(&name);
if filename.len() == 64 {
if let Ok(h) = hex::decode(filename) {
let mut hash = [0u8; 32];
hash.copy_from_slice(&h);
self.todo.push(BsiTodo::File {
path: ent.path(),
hash: hash.into(),
progress: 0,
});
}
}
}
}
let count = self.todo.len() - istart;
for (i, ent) in self.todo[istart..].iter_mut().enumerate() {
let p1 = progress_min
+ ((progress_max - progress_min) * i as u64) / count as u64;
let p2 = progress_min
+ ((progress_max - progress_min) * (i + 1) as u64) / count as u64;
match ent {
BsiTodo::Directory {
progress_min,
progress_max,
..
} => {
*progress_min = p1;
*progress_max = p2;
}
BsiTodo::File { progress, .. } => {
*progress = p1;
}
}
}
self.todo[istart..].reverse();
debug_assert!(self.progress_invariant());
}
Some(BsiTodo::File { path, hash, .. }) => {
return Ok(Some((path, hash)));
}
}
}
}
// for debug_assert!
fn progress_invariant(&self) -> bool {
let iter = self.todo.iter().map(|x| match x {
BsiTodo::Directory { progress_min, .. } => progress_min,
BsiTodo::File { progress, .. } => progress,
});
let iter_1 = iter.clone().skip(1);
iter.zip(iter_1).all(|(prev, next)| prev >= next)
}
}