2022-07-08 11:30:26 +00:00
|
|
|
use core::ops::Bound;
|
|
|
|
use std::path::PathBuf;
|
|
|
|
use std::sync::Arc;
|
|
|
|
use std::time::Duration;
|
|
|
|
|
|
|
|
use async_trait::async_trait;
|
2023-03-04 16:16:10 +00:00
|
|
|
use rand::Rng;
|
2022-07-08 11:30:26 +00:00
|
|
|
use tokio::fs;
|
|
|
|
use tokio::select;
|
|
|
|
use tokio::sync::mpsc;
|
|
|
|
use tokio::sync::watch;
|
|
|
|
|
|
|
|
use garage_util::background::*;
|
|
|
|
use garage_util::data::*;
|
|
|
|
use garage_util::error::*;
|
2023-01-04 12:07:13 +00:00
|
|
|
use garage_util::persister::PersisterShared;
|
2022-07-08 11:30:26 +00:00
|
|
|
use garage_util::time::*;
|
|
|
|
use garage_util::tranquilizer::Tranquilizer;
|
|
|
|
|
2023-09-07 13:30:56 +00:00
|
|
|
use crate::block::*;
|
2022-07-08 11:30:26 +00:00
|
|
|
use crate::manager::*;
|
|
|
|
|
2023-03-04 16:16:10 +00:00
|
|
|
// Full scrub every 25 days with a random element of 10 days mixed in below
|
|
|
|
const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 25);
|
2022-09-02 13:34:21 +00:00
|
|
|
// Scrub tranquility is initially set to 4, but can be changed in the CLI
|
|
|
|
// and the updated version is persisted over Garage restarts
|
|
|
|
const INITIAL_SCRUB_TRANQUILITY: u32 = 4;
|
|
|
|
|
|
|
|
// ---- ---- ----
|
|
|
|
// FIRST KIND OF REPAIR: FINDING MISSING BLOCKS/USELESS BLOCKS
|
|
|
|
// This is a one-shot repair operation that can be launched,
|
|
|
|
// checks everything, and then exits.
|
|
|
|
// ---- ---- ----
|
2022-07-08 11:30:26 +00:00
|
|
|
|
|
|
|
pub struct RepairWorker {
|
|
|
|
manager: Arc<BlockManager>,
|
|
|
|
next_start: Option<Hash>,
|
|
|
|
block_iter: Option<BlockStoreIterator>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl RepairWorker {
|
|
|
|
pub fn new(manager: Arc<BlockManager>) -> Self {
|
|
|
|
Self {
|
|
|
|
manager,
|
|
|
|
next_start: None,
|
|
|
|
block_iter: None,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[async_trait]
|
|
|
|
impl Worker for RepairWorker {
|
|
|
|
fn name(&self) -> String {
|
|
|
|
"Block repair worker".into()
|
|
|
|
}
|
|
|
|
|
2022-12-12 16:16:49 +00:00
|
|
|
fn status(&self) -> WorkerStatus {
|
2022-07-08 11:30:26 +00:00
|
|
|
match self.block_iter.as_ref() {
|
|
|
|
None => {
|
|
|
|
let idx_bytes = self
|
|
|
|
.next_start
|
|
|
|
.as_ref()
|
|
|
|
.map(|x| x.as_slice())
|
|
|
|
.unwrap_or(&[]);
|
|
|
|
let idx_bytes = if idx_bytes.len() > 4 {
|
|
|
|
&idx_bytes[..4]
|
|
|
|
} else {
|
|
|
|
idx_bytes
|
|
|
|
};
|
2022-12-12 16:16:49 +00:00
|
|
|
WorkerStatus {
|
2022-12-13 11:24:30 +00:00
|
|
|
progress: Some("0.00%".into()),
|
|
|
|
freeform: vec![format!(
|
|
|
|
"Currently in phase 1, iterator position: {}",
|
|
|
|
hex::encode(idx_bytes)
|
|
|
|
)],
|
2022-12-12 16:16:49 +00:00
|
|
|
..Default::default()
|
|
|
|
}
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
2022-12-12 16:16:49 +00:00
|
|
|
Some(bi) => WorkerStatus {
|
|
|
|
progress: Some(format!("{:.2}%", bi.progress() * 100.)),
|
2022-12-13 11:24:30 +00:00
|
|
|
freeform: vec!["Currently in phase 2".into()],
|
2022-12-12 16:16:49 +00:00
|
|
|
..Default::default()
|
|
|
|
},
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
|
|
|
match self.block_iter.as_mut() {
|
|
|
|
None => {
|
|
|
|
// Phase 1: Repair blocks from RC table.
|
|
|
|
|
|
|
|
// We have to do this complicated two-step process where we first read a bunch
|
|
|
|
// of hashes from the RC table, and then insert them in the to-resync queue,
|
|
|
|
// because of SQLite. Basically, as long as we have an iterator on a DB table,
|
|
|
|
// we can't do anything else on the DB. The naive approach (which we had previously)
|
|
|
|
// of just iterating on the RC table and inserting items one to one in the resync
|
|
|
|
// queue can't work here, it would just provoke a deadlock in the SQLite adapter code.
|
|
|
|
// This is mostly because the Rust bindings for SQLite assume a worst-case scenario
|
|
|
|
// where SQLite is not compiled in thread-safe mode, so we have to wrap everything
|
|
|
|
// in a mutex (see db/sqlite_adapter.rs and discussion in PR #322).
|
|
|
|
// TODO: maybe do this with tokio::task::spawn_blocking ?
|
|
|
|
let mut batch_of_hashes = vec![];
|
|
|
|
let start_bound = match self.next_start.as_ref() {
|
|
|
|
None => Bound::Unbounded,
|
|
|
|
Some(x) => Bound::Excluded(x.as_slice()),
|
|
|
|
};
|
|
|
|
for entry in self
|
|
|
|
.manager
|
|
|
|
.rc
|
|
|
|
.rc
|
|
|
|
.range::<&[u8], _>((start_bound, Bound::Unbounded))?
|
|
|
|
{
|
|
|
|
let (hash, _) = entry?;
|
|
|
|
let hash = Hash::try_from(&hash[..]).unwrap();
|
|
|
|
batch_of_hashes.push(hash);
|
|
|
|
if batch_of_hashes.len() >= 1000 {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if batch_of_hashes.is_empty() {
|
|
|
|
// move on to phase 2
|
|
|
|
self.block_iter = Some(BlockStoreIterator::new(&self.manager));
|
|
|
|
return Ok(WorkerState::Busy);
|
|
|
|
}
|
|
|
|
|
|
|
|
for hash in batch_of_hashes.into_iter() {
|
2022-09-02 14:47:15 +00:00
|
|
|
self.manager
|
|
|
|
.resync
|
|
|
|
.put_to_resync(&hash, Duration::from_secs(0))?;
|
2022-07-08 11:30:26 +00:00
|
|
|
self.next_start = Some(hash)
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(WorkerState::Busy)
|
|
|
|
}
|
|
|
|
Some(bi) => {
|
|
|
|
// Phase 2: Repair blocks actually on disk
|
|
|
|
// Lists all blocks on disk and adds them to the resync queue.
|
|
|
|
// This allows us to find blocks we are storing but don't actually need,
|
|
|
|
// so that we can offload them if necessary and then delete them locally.
|
2023-09-04 12:49:49 +00:00
|
|
|
if let Some((_path, hash)) = bi.next().await? {
|
2022-09-02 14:47:15 +00:00
|
|
|
self.manager
|
|
|
|
.resync
|
|
|
|
.put_to_resync(&hash, Duration::from_secs(0))?;
|
2022-07-08 11:30:26 +00:00
|
|
|
Ok(WorkerState::Busy)
|
|
|
|
} else {
|
|
|
|
Ok(WorkerState::Done)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-14 14:25:29 +00:00
|
|
|
async fn wait_for_work(&mut self) -> WorkerState {
|
2022-07-08 11:30:26 +00:00
|
|
|
unreachable!()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-02 13:34:21 +00:00
|
|
|
// ---- ---- ----
|
|
|
|
// SECOND KIND OF REPAIR: SCRUBBING THE DATASTORE
|
|
|
|
// This is significantly more complex than the process above,
|
|
|
|
// as it is a continuously-running task that triggers automatically
|
|
|
|
// every SCRUB_INTERVAL, but can also be triggered manually
|
|
|
|
// and whose parameter (esp. speed) can be controlled at runtime.
|
|
|
|
// ---- ---- ----
|
2022-07-08 11:30:26 +00:00
|
|
|
|
2023-03-09 15:34:14 +00:00
|
|
|
mod v081 {
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
|
|
|
#[derive(Serialize, Deserialize)]
|
|
|
|
pub struct ScrubWorkerPersisted {
|
|
|
|
pub tranquility: u32,
|
|
|
|
pub(crate) time_last_complete_scrub: u64,
|
|
|
|
pub(crate) corruptions_detected: u64,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl garage_util::migrate::InitialFormat for ScrubWorkerPersisted {}
|
|
|
|
}
|
|
|
|
|
|
|
|
mod v082 {
|
2023-09-05 15:37:45 +00:00
|
|
|
use garage_util::data::Hash;
|
2023-03-09 15:34:14 +00:00
|
|
|
use serde::{Deserialize, Serialize};
|
2023-09-05 15:37:45 +00:00
|
|
|
use std::path::PathBuf;
|
2023-03-09 15:34:14 +00:00
|
|
|
|
|
|
|
use super::v081;
|
|
|
|
|
|
|
|
#[derive(Serialize, Deserialize)]
|
|
|
|
pub struct ScrubWorkerPersisted {
|
|
|
|
pub tranquility: u32,
|
|
|
|
pub(crate) time_last_complete_scrub: u64,
|
|
|
|
pub(crate) time_next_run_scrub: u64,
|
|
|
|
pub(crate) corruptions_detected: u64,
|
2023-09-05 15:37:45 +00:00
|
|
|
#[serde(default)]
|
|
|
|
pub(crate) checkpoint: Option<BlockStoreIterator>,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Serialize, Deserialize, Clone)]
|
|
|
|
pub struct BlockStoreIterator {
|
|
|
|
pub todo: Vec<BsiTodo>,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Serialize, Deserialize, Clone)]
|
|
|
|
pub enum BsiTodo {
|
|
|
|
Directory {
|
|
|
|
path: PathBuf,
|
|
|
|
progress_min: u64,
|
|
|
|
progress_max: u64,
|
|
|
|
},
|
|
|
|
File {
|
|
|
|
path: PathBuf,
|
|
|
|
hash: Hash,
|
|
|
|
progress: u64,
|
|
|
|
},
|
2023-03-09 15:34:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
impl garage_util::migrate::Migrate for ScrubWorkerPersisted {
|
|
|
|
type Previous = v081::ScrubWorkerPersisted;
|
2023-03-13 17:44:09 +00:00
|
|
|
const VERSION_MARKER: &'static [u8] = b"G082bswp";
|
2023-03-09 15:34:14 +00:00
|
|
|
|
|
|
|
fn migrate(old: v081::ScrubWorkerPersisted) -> ScrubWorkerPersisted {
|
|
|
|
use crate::repair::randomize_next_scrub_run_time;
|
|
|
|
|
|
|
|
ScrubWorkerPersisted {
|
|
|
|
tranquility: old.tranquility,
|
|
|
|
time_last_complete_scrub: old.time_last_complete_scrub,
|
2023-03-09 16:32:22 +00:00
|
|
|
time_next_run_scrub: randomize_next_scrub_run_time(old.time_last_complete_scrub),
|
2023-03-09 15:34:14 +00:00
|
|
|
corruptions_detected: old.corruptions_detected,
|
2023-09-05 15:37:45 +00:00
|
|
|
checkpoint: None,
|
2023-03-09 15:34:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub use v082::*;
|
|
|
|
|
2022-07-08 11:30:26 +00:00
|
|
|
pub struct ScrubWorker {
|
|
|
|
manager: Arc<BlockManager>,
|
|
|
|
rx_cmd: mpsc::Receiver<ScrubWorkerCommand>,
|
|
|
|
|
|
|
|
work: ScrubWorkerState,
|
|
|
|
tranquilizer: Tranquilizer,
|
|
|
|
|
2023-01-04 12:07:13 +00:00
|
|
|
persister: PersisterShared<ScrubWorkerPersisted>,
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
|
2023-03-09 16:32:22 +00:00
|
|
|
fn randomize_next_scrub_run_time(timestamp: u64) -> u64 {
|
2023-03-04 16:16:10 +00:00
|
|
|
// Take SCRUB_INTERVAL and mix in a random interval of 10 days to attempt to
|
|
|
|
// balance scrub load across different cluster nodes.
|
|
|
|
|
2023-05-09 19:49:34 +00:00
|
|
|
timestamp
|
2023-03-04 16:16:10 +00:00
|
|
|
+ SCRUB_INTERVAL
|
|
|
|
.saturating_add(Duration::from_secs(
|
|
|
|
rand::thread_rng().gen_range(0..3600 * 24 * 10),
|
|
|
|
))
|
2023-05-09 19:49:34 +00:00
|
|
|
.as_millis() as u64
|
2023-03-04 16:16:10 +00:00
|
|
|
}
|
|
|
|
|
2023-01-04 12:07:13 +00:00
|
|
|
impl Default for ScrubWorkerPersisted {
|
|
|
|
fn default() -> Self {
|
|
|
|
ScrubWorkerPersisted {
|
|
|
|
time_last_complete_scrub: 0,
|
2023-03-09 16:32:22 +00:00
|
|
|
time_next_run_scrub: randomize_next_scrub_run_time(now_msec()),
|
2023-01-04 12:07:13 +00:00
|
|
|
tranquility: INITIAL_SCRUB_TRANQUILITY,
|
|
|
|
corruptions_detected: 0,
|
2023-09-05 15:37:45 +00:00
|
|
|
checkpoint: None,
|
2023-01-04 12:07:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-07-08 11:30:26 +00:00
|
|
|
|
2023-05-09 19:49:34 +00:00
|
|
|
#[derive(Default)]
|
2022-07-08 11:30:26 +00:00
|
|
|
enum ScrubWorkerState {
|
2023-09-05 15:37:45 +00:00
|
|
|
Running {
|
|
|
|
iterator: BlockStoreIterator,
|
|
|
|
// time of the last checkpoint
|
|
|
|
t_cp: u64,
|
|
|
|
},
|
|
|
|
Paused {
|
|
|
|
iterator: BlockStoreIterator,
|
|
|
|
// time at which the scrub should be resumed
|
|
|
|
t_resume: u64,
|
|
|
|
},
|
2023-05-09 19:49:34 +00:00
|
|
|
#[default]
|
2022-07-08 11:30:26 +00:00
|
|
|
Finished,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug)]
|
|
|
|
pub enum ScrubWorkerCommand {
|
|
|
|
Start,
|
|
|
|
Pause(Duration),
|
|
|
|
Resume,
|
|
|
|
Cancel,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl ScrubWorker {
|
2023-01-04 12:07:13 +00:00
|
|
|
pub(crate) fn new(
|
|
|
|
manager: Arc<BlockManager>,
|
|
|
|
rx_cmd: mpsc::Receiver<ScrubWorkerCommand>,
|
|
|
|
persister: PersisterShared<ScrubWorkerPersisted>,
|
|
|
|
) -> Self {
|
2023-09-05 15:37:45 +00:00
|
|
|
let work = match persister.get_with(|x| x.checkpoint.clone()) {
|
|
|
|
None => ScrubWorkerState::Finished,
|
|
|
|
Some(iterator) => ScrubWorkerState::Running {
|
|
|
|
iterator,
|
|
|
|
t_cp: now_msec(),
|
|
|
|
},
|
|
|
|
};
|
2022-07-08 11:30:26 +00:00
|
|
|
Self {
|
|
|
|
manager,
|
|
|
|
rx_cmd,
|
2023-09-05 15:37:45 +00:00
|
|
|
work,
|
2022-07-08 11:30:26 +00:00
|
|
|
tranquilizer: Tranquilizer::new(30),
|
|
|
|
persister,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
async fn handle_cmd(&mut self, cmd: ScrubWorkerCommand) {
|
|
|
|
match cmd {
|
|
|
|
ScrubWorkerCommand::Start => {
|
|
|
|
self.work = match std::mem::take(&mut self.work) {
|
|
|
|
ScrubWorkerState::Finished => {
|
2023-04-23 18:32:28 +00:00
|
|
|
info!("Scrub worker initializing, now performing datastore scrub");
|
2022-07-08 11:30:26 +00:00
|
|
|
let iterator = BlockStoreIterator::new(&self.manager);
|
2023-09-05 15:37:45 +00:00
|
|
|
if let Err(e) = self
|
|
|
|
.persister
|
|
|
|
.set_with(|x| x.checkpoint = Some(iterator.clone()))
|
|
|
|
{
|
|
|
|
error!("Could not save scrub checkpoint: {}", e);
|
|
|
|
}
|
|
|
|
ScrubWorkerState::Running {
|
|
|
|
iterator,
|
|
|
|
t_cp: now_msec(),
|
|
|
|
}
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
work => {
|
|
|
|
error!("Cannot start scrub worker: already running!");
|
|
|
|
work
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
ScrubWorkerCommand::Pause(dur) => {
|
|
|
|
self.work = match std::mem::take(&mut self.work) {
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Running { iterator, .. }
|
|
|
|
| ScrubWorkerState::Paused { iterator, .. } => {
|
|
|
|
if let Err(e) = self
|
|
|
|
.persister
|
|
|
|
.set_with(|x| x.checkpoint = Some(iterator.clone()))
|
|
|
|
{
|
|
|
|
error!("Could not save scrub checkpoint: {}", e);
|
|
|
|
}
|
|
|
|
ScrubWorkerState::Paused {
|
|
|
|
iterator,
|
|
|
|
t_resume: now_msec() + dur.as_millis() as u64,
|
|
|
|
}
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
work => {
|
|
|
|
error!("Cannot pause scrub worker: not running!");
|
|
|
|
work
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
ScrubWorkerCommand::Resume => {
|
|
|
|
self.work = match std::mem::take(&mut self.work) {
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Paused { iterator, .. } => ScrubWorkerState::Running {
|
|
|
|
iterator,
|
|
|
|
t_cp: now_msec(),
|
|
|
|
},
|
2022-07-08 11:30:26 +00:00
|
|
|
work => {
|
|
|
|
error!("Cannot resume scrub worker: not paused!");
|
|
|
|
work
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
ScrubWorkerCommand::Cancel => {
|
|
|
|
self.work = match std::mem::take(&mut self.work) {
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Running { .. } | ScrubWorkerState::Paused { .. } => {
|
2023-09-11 10:10:48 +00:00
|
|
|
if let Err(e) = self.persister.set_with(|x| x.checkpoint = None) {
|
|
|
|
error!("Could not save scrub checkpoint: {}", e);
|
|
|
|
}
|
2022-07-08 11:30:26 +00:00
|
|
|
ScrubWorkerState::Finished
|
|
|
|
}
|
|
|
|
work => {
|
|
|
|
error!("Cannot cancel scrub worker: not running!");
|
|
|
|
work
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[async_trait]
|
|
|
|
impl Worker for ScrubWorker {
|
|
|
|
fn name(&self) -> String {
|
|
|
|
"Block scrub worker".into()
|
|
|
|
}
|
|
|
|
|
2022-12-12 16:16:49 +00:00
|
|
|
fn status(&self) -> WorkerStatus {
|
2023-03-04 16:16:10 +00:00
|
|
|
let (corruptions_detected, tranquility, time_last_complete_scrub, time_next_run_scrub) =
|
2023-01-04 12:07:13 +00:00
|
|
|
self.persister.get_with(|p| {
|
|
|
|
(
|
|
|
|
p.corruptions_detected,
|
|
|
|
p.tranquility,
|
|
|
|
p.time_last_complete_scrub,
|
2023-03-04 16:16:10 +00:00
|
|
|
p.time_next_run_scrub,
|
2023-01-04 12:07:13 +00:00
|
|
|
)
|
|
|
|
});
|
|
|
|
|
2022-12-12 16:16:49 +00:00
|
|
|
let mut s = WorkerStatus {
|
2023-01-04 12:07:13 +00:00
|
|
|
persistent_errors: Some(corruptions_detected),
|
|
|
|
tranquility: Some(tranquility),
|
2022-12-12 16:16:49 +00:00
|
|
|
..Default::default()
|
|
|
|
};
|
|
|
|
match &self.work {
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Running { iterator, .. } => {
|
|
|
|
s.progress = Some(format!("{:.2}%", iterator.progress() * 100.));
|
2022-12-12 16:16:49 +00:00
|
|
|
}
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Paused { iterator, t_resume } => {
|
|
|
|
s.progress = Some(format!("{:.2}%", iterator.progress() * 100.));
|
|
|
|
s.freeform = vec![format!(
|
|
|
|
"Scrub paused, resumes at {}",
|
|
|
|
msec_to_rfc3339(*t_resume)
|
|
|
|
)];
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
2022-12-12 16:16:49 +00:00
|
|
|
ScrubWorkerState::Finished => {
|
2023-03-04 16:16:10 +00:00
|
|
|
s.freeform = vec![
|
|
|
|
format!(
|
|
|
|
"Last scrub completed at {}",
|
|
|
|
msec_to_rfc3339(time_last_complete_scrub),
|
|
|
|
),
|
|
|
|
format!(
|
|
|
|
"Next scrub scheduled for {}",
|
|
|
|
msec_to_rfc3339(time_next_run_scrub)
|
|
|
|
),
|
|
|
|
];
|
2022-12-12 16:16:49 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
s
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
|
|
|
match self.rx_cmd.try_recv() {
|
|
|
|
Ok(cmd) => self.handle_cmd(cmd).await,
|
|
|
|
Err(mpsc::error::TryRecvError::Disconnected) => return Ok(WorkerState::Done),
|
|
|
|
Err(mpsc::error::TryRecvError::Empty) => (),
|
|
|
|
};
|
|
|
|
|
|
|
|
match &mut self.work {
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Running { iterator, t_cp } => {
|
2022-07-08 11:30:26 +00:00
|
|
|
self.tranquilizer.reset();
|
2023-09-05 15:37:45 +00:00
|
|
|
let now = now_msec();
|
|
|
|
|
|
|
|
if let Some((_path, hash)) = iterator.next().await? {
|
2022-07-08 11:30:26 +00:00
|
|
|
match self.manager.read_block(&hash).await {
|
|
|
|
Err(Error::CorruptData(_)) => {
|
|
|
|
error!("Found corrupt data block during scrub: {:?}", hash);
|
2023-01-04 12:07:13 +00:00
|
|
|
self.persister.set_with(|p| p.corruptions_detected += 1)?;
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
Err(e) => return Err(e),
|
|
|
|
_ => (),
|
|
|
|
};
|
2023-09-05 15:37:45 +00:00
|
|
|
|
|
|
|
if now - *t_cp > 60 * 1000 {
|
|
|
|
self.persister
|
|
|
|
.set_with(|p| p.checkpoint = Some(iterator.clone()))?;
|
|
|
|
*t_cp = now;
|
|
|
|
}
|
|
|
|
|
2022-07-08 11:30:26 +00:00
|
|
|
Ok(self
|
|
|
|
.tranquilizer
|
2023-01-04 12:07:13 +00:00
|
|
|
.tranquilize_worker(self.persister.get_with(|p| p.tranquility)))
|
2022-07-08 11:30:26 +00:00
|
|
|
} else {
|
2023-04-23 18:32:28 +00:00
|
|
|
let next_scrub_timestamp = randomize_next_scrub_run_time(now);
|
|
|
|
|
2023-03-04 16:16:10 +00:00
|
|
|
self.persister.set_with(|p| {
|
2023-04-23 18:32:28 +00:00
|
|
|
p.time_last_complete_scrub = now;
|
|
|
|
p.time_next_run_scrub = next_scrub_timestamp;
|
2023-09-05 15:37:45 +00:00
|
|
|
p.checkpoint = None;
|
2023-03-04 16:16:10 +00:00
|
|
|
})?;
|
2022-07-08 11:30:26 +00:00
|
|
|
self.work = ScrubWorkerState::Finished;
|
|
|
|
self.tranquilizer.clear();
|
2023-04-23 18:32:28 +00:00
|
|
|
|
|
|
|
info!(
|
|
|
|
"Datastore scrub completed, next scrub scheduled for {}",
|
|
|
|
msec_to_rfc3339(next_scrub_timestamp)
|
|
|
|
);
|
|
|
|
|
2022-07-08 11:30:26 +00:00
|
|
|
Ok(WorkerState::Idle)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_ => Ok(WorkerState::Idle),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-14 14:25:29 +00:00
|
|
|
async fn wait_for_work(&mut self) -> WorkerState {
|
2022-07-08 11:30:26 +00:00
|
|
|
let (wait_until, command) = match &self.work {
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Running { .. } => return WorkerState::Busy,
|
|
|
|
ScrubWorkerState::Paused { t_resume, .. } => (*t_resume, ScrubWorkerCommand::Resume),
|
2022-07-08 11:30:26 +00:00
|
|
|
ScrubWorkerState::Finished => (
|
2023-03-04 16:16:10 +00:00
|
|
|
self.persister.get_with(|p| p.time_next_run_scrub),
|
2022-07-08 11:30:26 +00:00
|
|
|
ScrubWorkerCommand::Start,
|
|
|
|
),
|
|
|
|
};
|
|
|
|
|
|
|
|
let now = now_msec();
|
|
|
|
if now >= wait_until {
|
|
|
|
self.handle_cmd(command).await;
|
|
|
|
return WorkerState::Busy;
|
|
|
|
}
|
|
|
|
let delay = Duration::from_millis(wait_until - now);
|
|
|
|
select! {
|
|
|
|
_ = tokio::time::sleep(delay) => self.handle_cmd(command).await,
|
|
|
|
cmd = self.rx_cmd.recv() => if let Some(cmd) = cmd {
|
|
|
|
self.handle_cmd(cmd).await;
|
|
|
|
} else {
|
|
|
|
return WorkerState::Done;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
match &self.work {
|
2023-09-05 15:37:45 +00:00
|
|
|
ScrubWorkerState::Running { .. } => WorkerState::Busy,
|
2022-07-08 11:30:26 +00:00
|
|
|
_ => WorkerState::Idle,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-09-07 11:44:11 +00:00
|
|
|
// ---- ---- ----
|
|
|
|
// THIRD KIND OF REPAIR: REBALANCING DATA BLOCKS
|
|
|
|
// between multiple storage locations.
|
|
|
|
// This is a one-shot repair operation that can be launched,
|
|
|
|
// checks everything, and then exits.
|
|
|
|
// ---- ---- ----
|
|
|
|
|
|
|
|
pub struct RebalanceWorker {
|
|
|
|
manager: Arc<BlockManager>,
|
|
|
|
block_iter: BlockStoreIterator,
|
2023-09-07 13:30:56 +00:00
|
|
|
t_started: u64,
|
|
|
|
t_finished: Option<u64>,
|
2023-09-07 11:44:11 +00:00
|
|
|
moved: usize,
|
2023-09-07 13:30:56 +00:00
|
|
|
moved_bytes: u64,
|
2023-09-07 11:44:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
impl RebalanceWorker {
|
|
|
|
pub fn new(manager: Arc<BlockManager>) -> Self {
|
|
|
|
let block_iter = BlockStoreIterator::new(&manager);
|
|
|
|
Self {
|
|
|
|
manager,
|
|
|
|
block_iter,
|
2023-09-07 13:30:56 +00:00
|
|
|
t_started: now_msec(),
|
|
|
|
t_finished: None,
|
2023-09-07 11:44:11 +00:00
|
|
|
moved: 0,
|
|
|
|
moved_bytes: 0,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[async_trait]
|
|
|
|
impl Worker for RebalanceWorker {
|
|
|
|
fn name(&self) -> String {
|
|
|
|
"Block rebalance worker".into()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn status(&self) -> WorkerStatus {
|
2023-09-07 13:30:56 +00:00
|
|
|
let t_cur = self.t_finished.unwrap_or_else(|| now_msec());
|
|
|
|
let rate = self.moved_bytes / std::cmp::max(1, (t_cur - self.t_started) / 1000);
|
2023-09-07 14:04:03 +00:00
|
|
|
let mut freeform = vec![
|
|
|
|
format!("Blocks moved: {}", self.moved),
|
|
|
|
format!(
|
|
|
|
"Bytes moved: {} ({}/s)",
|
|
|
|
bytesize::ByteSize::b(self.moved_bytes),
|
|
|
|
bytesize::ByteSize::b(rate)
|
|
|
|
),
|
|
|
|
format!("Started: {}", msec_to_rfc3339(self.t_started)),
|
|
|
|
];
|
|
|
|
if let Some(t_fin) = self.t_finished {
|
|
|
|
freeform.push(format!("Finished: {}", msec_to_rfc3339(t_fin)))
|
|
|
|
}
|
2023-09-07 11:44:11 +00:00
|
|
|
WorkerStatus {
|
|
|
|
progress: Some(format!("{:.2}%", self.block_iter.progress() * 100.)),
|
2023-09-07 14:04:03 +00:00
|
|
|
freeform,
|
2023-09-07 11:44:11 +00:00
|
|
|
..Default::default()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
|
|
|
if let Some((path, hash)) = self.block_iter.next().await? {
|
|
|
|
let prim_loc = self.manager.data_layout.load().primary_block_dir(&hash);
|
2023-09-07 14:04:03 +00:00
|
|
|
if path.ancestors().all(|x| x != prim_loc) {
|
|
|
|
let block_path = match path.extension() {
|
|
|
|
None => DataBlockPath::Plain(path.clone()),
|
|
|
|
Some(x) if x.to_str() == Some("zst") => DataBlockPath::Compressed(path.clone()),
|
2023-09-07 13:30:56 +00:00
|
|
|
_ => {
|
|
|
|
warn!("not rebalancing file: {}", path.to_string_lossy());
|
|
|
|
return Ok(WorkerState::Busy);
|
|
|
|
}
|
|
|
|
};
|
2023-09-07 11:44:11 +00:00
|
|
|
// block is not in its primary location,
|
|
|
|
// move it there (reading and re-writing does the trick)
|
2023-09-07 14:04:03 +00:00
|
|
|
debug!("rebalance: moving block {:?} => {:?}", block_path, prim_loc);
|
|
|
|
let block_len = self.manager.fix_block_location(&hash, block_path).await?;
|
2023-09-07 11:44:11 +00:00
|
|
|
self.moved += 1;
|
2023-09-07 14:04:03 +00:00
|
|
|
self.moved_bytes += block_len as u64;
|
2023-09-07 11:44:11 +00:00
|
|
|
}
|
|
|
|
Ok(WorkerState::Busy)
|
|
|
|
} else {
|
|
|
|
// all blocks are in their primary location:
|
|
|
|
// - the ones we moved now are
|
|
|
|
// - the ones written in the meantime always were, because we only
|
|
|
|
// write to primary locations
|
|
|
|
// so we can safely remove all secondary locations from the data layout
|
|
|
|
let new_layout = self
|
|
|
|
.manager
|
|
|
|
.data_layout
|
|
|
|
.load_full()
|
|
|
|
.without_secondary_locations();
|
|
|
|
self.manager
|
|
|
|
.data_layout_persister
|
|
|
|
.save_async(&new_layout)
|
|
|
|
.await?;
|
|
|
|
self.manager.data_layout.store(Arc::new(new_layout));
|
2023-09-07 13:30:56 +00:00
|
|
|
self.t_finished = Some(now_msec());
|
2023-09-07 11:44:11 +00:00
|
|
|
Ok(WorkerState::Done)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
async fn wait_for_work(&mut self) -> WorkerState {
|
|
|
|
unreachable!()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-02 13:34:21 +00:00
|
|
|
// ---- ---- ----
|
|
|
|
// UTILITY FOR ENUMERATING THE BLOCK STORE
|
|
|
|
// ---- ---- ----
|
2022-07-08 11:30:26 +00:00
|
|
|
|
2023-09-04 12:49:49 +00:00
|
|
|
const PROGRESS_FP: u64 = 1_000_000_000;
|
|
|
|
|
2022-07-08 11:30:26 +00:00
|
|
|
impl BlockStoreIterator {
|
|
|
|
fn new(manager: &BlockManager) -> Self {
|
2023-09-07 11:44:11 +00:00
|
|
|
let data_layout = manager.data_layout.load_full();
|
|
|
|
|
2023-09-11 10:28:29 +00:00
|
|
|
let mut dir_cap = vec![0; data_layout.data_dirs.len()];
|
|
|
|
for prim in data_layout.part_prim.iter() {
|
|
|
|
dir_cap[*prim as usize] += 1;
|
|
|
|
}
|
|
|
|
for sec_vec in data_layout.part_sec.iter() {
|
|
|
|
for sec in sec_vec.iter() {
|
|
|
|
dir_cap[*sec as usize] += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
let sum_cap = dir_cap.iter().sum::<usize>() as u64;
|
2023-09-04 12:49:49 +00:00
|
|
|
|
|
|
|
let mut cum_cap = 0;
|
|
|
|
let mut todo = vec![];
|
2023-09-11 10:28:29 +00:00
|
|
|
for (dir, cap) in data_layout.data_dirs.iter().zip(dir_cap.into_iter()) {
|
|
|
|
let progress_min = (cum_cap * PROGRESS_FP) / sum_cap;
|
|
|
|
let progress_max = ((cum_cap + cap as u64) * PROGRESS_FP) / sum_cap;
|
|
|
|
cum_cap += cap as u64;
|
2023-09-04 12:49:49 +00:00
|
|
|
|
|
|
|
todo.push(BsiTodo::Directory {
|
|
|
|
path: dir.path.clone(),
|
|
|
|
progress_min,
|
|
|
|
progress_max,
|
|
|
|
});
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
2023-09-04 12:49:49 +00:00
|
|
|
// entries are processed back-to-front (because of .pop()),
|
|
|
|
// so reverse entries to process them in increasing progress bounds
|
|
|
|
todo.reverse();
|
|
|
|
|
|
|
|
let ret = Self { todo };
|
|
|
|
debug_assert!(ret.progress_invariant());
|
|
|
|
|
|
|
|
ret
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns progress done, between 0 and 1
|
|
|
|
fn progress(&self) -> f32 {
|
2023-09-04 12:49:49 +00:00
|
|
|
self.todo
|
|
|
|
.last()
|
|
|
|
.map(|x| match x {
|
|
|
|
BsiTodo::Directory { progress_min, .. } => *progress_min,
|
|
|
|
BsiTodo::File { progress, .. } => *progress,
|
|
|
|
})
|
|
|
|
.map(|x| x as f32 / PROGRESS_FP as f32)
|
|
|
|
.unwrap_or(1.0)
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
|
2023-09-04 12:49:49 +00:00
|
|
|
async fn next(&mut self) -> Result<Option<(PathBuf, Hash)>, Error> {
|
2022-07-08 11:30:26 +00:00
|
|
|
loop {
|
2023-09-04 12:49:49 +00:00
|
|
|
match self.todo.pop() {
|
2022-07-08 11:30:26 +00:00
|
|
|
None => return Ok(None),
|
2023-09-04 12:49:49 +00:00
|
|
|
Some(BsiTodo::Directory {
|
|
|
|
path,
|
|
|
|
progress_min,
|
|
|
|
progress_max,
|
|
|
|
}) => {
|
|
|
|
let istart = self.todo.len();
|
|
|
|
|
|
|
|
let mut reader = fs::read_dir(&path).await?;
|
|
|
|
while let Some(ent) = reader.next_entry().await? {
|
|
|
|
let name = if let Ok(n) = ent.file_name().into_string() {
|
|
|
|
n
|
|
|
|
} else {
|
|
|
|
continue;
|
|
|
|
};
|
|
|
|
let ft = ent.file_type().await?;
|
|
|
|
if ft.is_dir() && hex::decode(&name).is_ok() {
|
|
|
|
self.todo.push(BsiTodo::Directory {
|
|
|
|
path: ent.path(),
|
|
|
|
progress_min: 0,
|
|
|
|
progress_max: 0,
|
|
|
|
});
|
|
|
|
} else if ft.is_file() {
|
2023-09-05 15:37:45 +00:00
|
|
|
let filename = name.split_once('.').map(|(f, _)| f).unwrap_or(&name);
|
|
|
|
if filename.len() == 64 {
|
|
|
|
if let Ok(h) = hex::decode(filename) {
|
|
|
|
let mut hash = [0u8; 32];
|
|
|
|
hash.copy_from_slice(&h);
|
|
|
|
self.todo.push(BsiTodo::File {
|
|
|
|
path: ent.path(),
|
|
|
|
hash: hash.into(),
|
|
|
|
progress: 0,
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
2023-09-04 12:49:49 +00:00
|
|
|
}
|
|
|
|
}
|
2022-07-08 11:30:26 +00:00
|
|
|
|
2023-09-04 12:49:49 +00:00
|
|
|
let count = self.todo.len() - istart;
|
|
|
|
for (i, ent) in self.todo[istart..].iter_mut().enumerate() {
|
|
|
|
let p1 = progress_min
|
|
|
|
+ ((progress_max - progress_min) * i as u64) / count as u64;
|
|
|
|
let p2 = progress_min
|
|
|
|
+ ((progress_max - progress_min) * (i + 1) as u64) / count as u64;
|
|
|
|
match ent {
|
|
|
|
BsiTodo::Directory {
|
|
|
|
progress_min,
|
|
|
|
progress_max,
|
|
|
|
..
|
|
|
|
} => {
|
|
|
|
*progress_min = p1;
|
|
|
|
*progress_max = p2;
|
|
|
|
}
|
|
|
|
BsiTodo::File { progress, .. } => {
|
|
|
|
*progress = p1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
self.todo[istart..].reverse();
|
|
|
|
debug_assert!(self.progress_invariant());
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
2023-09-05 15:37:45 +00:00
|
|
|
Some(BsiTodo::File { path, hash, .. }) => {
|
|
|
|
return Ok(Some((path, hash)));
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-09-04 12:49:49 +00:00
|
|
|
|
2023-09-05 15:37:45 +00:00
|
|
|
// for debug_assert!
|
2023-09-04 12:49:49 +00:00
|
|
|
fn progress_invariant(&self) -> bool {
|
|
|
|
let iter = self.todo.iter().map(|x| match x {
|
|
|
|
BsiTodo::Directory { progress_min, .. } => progress_min,
|
|
|
|
BsiTodo::File { progress, .. } => progress,
|
|
|
|
});
|
|
|
|
let iter_1 = iter.clone().skip(1);
|
|
|
|
iter.zip(iter_1).all(|(prev, next)| prev >= next)
|
|
|
|
}
|
2022-07-08 11:30:26 +00:00
|
|
|
}
|