block/repair.rs: Added a random element of 10 days to SCRUB_INTERVAL #516

Merged
lx merged 2 commits from jpds/garage:scrub-randomize-window into main 2023-03-06 14:11:25 +00:00
Showing only changes of commit 53d09eb00f - Show all commits

View file

@ -4,6 +4,7 @@ use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use async_trait::async_trait; use async_trait::async_trait;
use rand::Rng;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::fs; use tokio::fs;
use tokio::select; use tokio::select;
@ -19,8 +20,8 @@ use garage_util::tranquilizer::Tranquilizer;
use crate::manager::*; use crate::manager::*;
// Full scrub every 30 days // Full scrub every 25 days with a random element of 10 days mixed in below
const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 30); const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 25);
// Scrub tranquility is initially set to 4, but can be changed in the CLI // Scrub tranquility is initially set to 4, but can be changed in the CLI
// and the updated version is persisted over Garage restarts // and the updated version is persisted over Garage restarts
const INITIAL_SCRUB_TRANQUILITY: u32 = 4; const INITIAL_SCRUB_TRANQUILITY: u32 = 4;
@ -175,13 +176,30 @@ pub struct ScrubWorker {
pub struct ScrubWorkerPersisted { pub struct ScrubWorkerPersisted {
pub tranquility: u32, pub tranquility: u32,
pub(crate) time_last_complete_scrub: u64, pub(crate) time_last_complete_scrub: u64,
pub(crate) time_next_run_scrub: u64,
pub(crate) corruptions_detected: u64, pub(crate) corruptions_detected: u64,
} }
fn randomize_next_scrub_run_time() -> u64 {
jpds marked this conversation as resolved Outdated
Outdated
Review

the name of this function should contain the word scrub, maybe randomize_next_scrub_run_time

the name of this function should contain the word `scrub`, maybe `randomize_next_scrub_run_time`
// Take SCRUB_INTERVAL and mix in a random interval of 10 days to attempt to
// balance scrub load across different cluster nodes.
let next_run_timestamp = now_msec()
+ SCRUB_INTERVAL
.saturating_add(Duration::from_secs(
rand::thread_rng().gen_range(0..3600 * 24 * 10),
))
.as_millis() as u64;
next_run_timestamp
}
impl garage_util::migrate::InitialFormat for ScrubWorkerPersisted {} impl garage_util::migrate::InitialFormat for ScrubWorkerPersisted {}
impl Default for ScrubWorkerPersisted { impl Default for ScrubWorkerPersisted {
fn default() -> Self { fn default() -> Self {
ScrubWorkerPersisted { ScrubWorkerPersisted {
time_last_complete_scrub: 0, time_last_complete_scrub: 0,
time_next_run_scrub: randomize_next_scrub_run_time(),
tranquility: INITIAL_SCRUB_TRANQUILITY, tranquility: INITIAL_SCRUB_TRANQUILITY,
corruptions_detected: 0, corruptions_detected: 0,
} }
@ -279,12 +297,13 @@ impl Worker for ScrubWorker {
} }
fn status(&self) -> WorkerStatus { fn status(&self) -> WorkerStatus {
let (corruptions_detected, tranquility, time_last_complete_scrub) = let (corruptions_detected, tranquility, time_last_complete_scrub, time_next_run_scrub) =
self.persister.get_with(|p| { self.persister.get_with(|p| {
( (
p.corruptions_detected, p.corruptions_detected,
p.tranquility, p.tranquility,
p.time_last_complete_scrub, p.time_last_complete_scrub,
p.time_next_run_scrub,
) )
}); });
@ -302,10 +321,16 @@ impl Worker for ScrubWorker {
s.freeform = vec![format!("Scrub paused, resumes at {}", msec_to_rfc3339(*rt))]; s.freeform = vec![format!("Scrub paused, resumes at {}", msec_to_rfc3339(*rt))];
} }
ScrubWorkerState::Finished => { ScrubWorkerState::Finished => {
s.freeform = vec![format!( s.freeform = vec![
format!(
"Last scrub completed at {}", "Last scrub completed at {}",
msec_to_rfc3339(time_last_complete_scrub) msec_to_rfc3339(time_last_complete_scrub),
)]; ),
format!(
"Next scrub scheduled for {}",
msec_to_rfc3339(time_next_run_scrub)
),
];
} }
} }
s s
@ -334,8 +359,10 @@ impl Worker for ScrubWorker {
.tranquilizer .tranquilizer
.tranquilize_worker(self.persister.get_with(|p| p.tranquility))) .tranquilize_worker(self.persister.get_with(|p| p.tranquility)))
} else { } else {
self.persister self.persister.set_with(|p| {
.set_with(|p| p.time_last_complete_scrub = now_msec())?; p.time_last_complete_scrub = now_msec();
p.time_next_run_scrub = randomize_next_scrub_run_time();
})?;
jpds marked this conversation as resolved Outdated
Outdated
Review

Probably better to concatenate those two calls in one:

self.persister.set_with(|p| 
  p.time_last_complete_scrub = now_msec();
  p.time_next_run_scrub = randomize_next_run_time();
})?

set_with does non-async (possibly blocking) IO to write the persistent data to disk, better do it only once if possible

Probably better to concatenate those two calls in one: ```rust self.persister.set_with(|p| p.time_last_complete_scrub = now_msec(); p.time_next_run_scrub = randomize_next_run_time(); })? ``` `set_with` does non-async (possibly blocking) IO to write the persistent data to disk, better do it only once if possible
self.work = ScrubWorkerState::Finished; self.work = ScrubWorkerState::Finished;
self.tranquilizer.clear(); self.tranquilizer.clear();
Ok(WorkerState::Idle) Ok(WorkerState::Idle)
@ -350,8 +377,7 @@ impl Worker for ScrubWorker {
ScrubWorkerState::Running(_) => return WorkerState::Busy, ScrubWorkerState::Running(_) => return WorkerState::Busy,
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume), ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
ScrubWorkerState::Finished => ( ScrubWorkerState::Finished => (
self.persister.get_with(|p| p.time_last_complete_scrub) self.persister.get_with(|p| p.time_next_run_scrub),
+ SCRUB_INTERVAL.as_millis() as u64,
ScrubWorkerCommand::Start, ScrubWorkerCommand::Start,
), ),
}; };