forked from Deuxfleurs/garage
Merge pull request 'block/repair.rs: Added a random element of 10 days to SCRUB_INTERVAL' (#516) from jpds/garage:scrub-randomize-window into main
Reviewed-on: Deuxfleurs/garage#516
This commit is contained in:
commit
2dc80abbb1
2 changed files with 40 additions and 11 deletions
|
@ -185,6 +185,9 @@ impl BlockManager {
|
||||||
vars.register_ro(&self.scrub_persister, "scrub-last-completed", |p| {
|
vars.register_ro(&self.scrub_persister, "scrub-last-completed", |p| {
|
||||||
p.get_with(|x| msec_to_rfc3339(x.time_last_complete_scrub))
|
p.get_with(|x| msec_to_rfc3339(x.time_last_complete_scrub))
|
||||||
});
|
});
|
||||||
|
vars.register_ro(&self.scrub_persister, "scrub-next-run", |p| {
|
||||||
|
p.get_with(|x| msec_to_rfc3339(x.time_next_run_scrub))
|
||||||
|
});
|
||||||
vars.register_ro(&self.scrub_persister, "scrub-corruptions_detected", |p| {
|
vars.register_ro(&self.scrub_persister, "scrub-corruptions_detected", |p| {
|
||||||
p.get_with(|x| x.corruptions_detected)
|
p.get_with(|x| x.corruptions_detected)
|
||||||
});
|
});
|
||||||
|
|
|
@ -4,6 +4,7 @@ use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use rand::Rng;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
use tokio::select;
|
use tokio::select;
|
||||||
|
@ -19,8 +20,8 @@ use garage_util::tranquilizer::Tranquilizer;
|
||||||
|
|
||||||
use crate::manager::*;
|
use crate::manager::*;
|
||||||
|
|
||||||
// Full scrub every 30 days
|
// Full scrub every 25 days with a random element of 10 days mixed in below
|
||||||
const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 30);
|
const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 25);
|
||||||
// Scrub tranquility is initially set to 4, but can be changed in the CLI
|
// Scrub tranquility is initially set to 4, but can be changed in the CLI
|
||||||
// and the updated version is persisted over Garage restarts
|
// and the updated version is persisted over Garage restarts
|
||||||
const INITIAL_SCRUB_TRANQUILITY: u32 = 4;
|
const INITIAL_SCRUB_TRANQUILITY: u32 = 4;
|
||||||
|
@ -175,13 +176,30 @@ pub struct ScrubWorker {
|
||||||
pub struct ScrubWorkerPersisted {
|
pub struct ScrubWorkerPersisted {
|
||||||
pub tranquility: u32,
|
pub tranquility: u32,
|
||||||
pub(crate) time_last_complete_scrub: u64,
|
pub(crate) time_last_complete_scrub: u64,
|
||||||
|
pub(crate) time_next_run_scrub: u64,
|
||||||
pub(crate) corruptions_detected: u64,
|
pub(crate) corruptions_detected: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn randomize_next_scrub_run_time() -> u64 {
|
||||||
|
// Take SCRUB_INTERVAL and mix in a random interval of 10 days to attempt to
|
||||||
|
// balance scrub load across different cluster nodes.
|
||||||
|
|
||||||
|
let next_run_timestamp = now_msec()
|
||||||
|
+ SCRUB_INTERVAL
|
||||||
|
.saturating_add(Duration::from_secs(
|
||||||
|
rand::thread_rng().gen_range(0..3600 * 24 * 10),
|
||||||
|
))
|
||||||
|
.as_millis() as u64;
|
||||||
|
|
||||||
|
next_run_timestamp
|
||||||
|
}
|
||||||
|
|
||||||
impl garage_util::migrate::InitialFormat for ScrubWorkerPersisted {}
|
impl garage_util::migrate::InitialFormat for ScrubWorkerPersisted {}
|
||||||
impl Default for ScrubWorkerPersisted {
|
impl Default for ScrubWorkerPersisted {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
ScrubWorkerPersisted {
|
ScrubWorkerPersisted {
|
||||||
time_last_complete_scrub: 0,
|
time_last_complete_scrub: 0,
|
||||||
|
time_next_run_scrub: randomize_next_scrub_run_time(),
|
||||||
tranquility: INITIAL_SCRUB_TRANQUILITY,
|
tranquility: INITIAL_SCRUB_TRANQUILITY,
|
||||||
corruptions_detected: 0,
|
corruptions_detected: 0,
|
||||||
}
|
}
|
||||||
|
@ -279,12 +297,13 @@ impl Worker for ScrubWorker {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn status(&self) -> WorkerStatus {
|
fn status(&self) -> WorkerStatus {
|
||||||
let (corruptions_detected, tranquility, time_last_complete_scrub) =
|
let (corruptions_detected, tranquility, time_last_complete_scrub, time_next_run_scrub) =
|
||||||
self.persister.get_with(|p| {
|
self.persister.get_with(|p| {
|
||||||
(
|
(
|
||||||
p.corruptions_detected,
|
p.corruptions_detected,
|
||||||
p.tranquility,
|
p.tranquility,
|
||||||
p.time_last_complete_scrub,
|
p.time_last_complete_scrub,
|
||||||
|
p.time_next_run_scrub,
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -302,10 +321,16 @@ impl Worker for ScrubWorker {
|
||||||
s.freeform = vec![format!("Scrub paused, resumes at {}", msec_to_rfc3339(*rt))];
|
s.freeform = vec![format!("Scrub paused, resumes at {}", msec_to_rfc3339(*rt))];
|
||||||
}
|
}
|
||||||
ScrubWorkerState::Finished => {
|
ScrubWorkerState::Finished => {
|
||||||
s.freeform = vec![format!(
|
s.freeform = vec![
|
||||||
|
format!(
|
||||||
"Last scrub completed at {}",
|
"Last scrub completed at {}",
|
||||||
msec_to_rfc3339(time_last_complete_scrub)
|
msec_to_rfc3339(time_last_complete_scrub),
|
||||||
)];
|
),
|
||||||
|
format!(
|
||||||
|
"Next scrub scheduled for {}",
|
||||||
|
msec_to_rfc3339(time_next_run_scrub)
|
||||||
|
),
|
||||||
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
s
|
s
|
||||||
|
@ -334,8 +359,10 @@ impl Worker for ScrubWorker {
|
||||||
.tranquilizer
|
.tranquilizer
|
||||||
.tranquilize_worker(self.persister.get_with(|p| p.tranquility)))
|
.tranquilize_worker(self.persister.get_with(|p| p.tranquility)))
|
||||||
} else {
|
} else {
|
||||||
self.persister
|
self.persister.set_with(|p| {
|
||||||
.set_with(|p| p.time_last_complete_scrub = now_msec())?;
|
p.time_last_complete_scrub = now_msec();
|
||||||
|
p.time_next_run_scrub = randomize_next_scrub_run_time();
|
||||||
|
})?;
|
||||||
self.work = ScrubWorkerState::Finished;
|
self.work = ScrubWorkerState::Finished;
|
||||||
self.tranquilizer.clear();
|
self.tranquilizer.clear();
|
||||||
Ok(WorkerState::Idle)
|
Ok(WorkerState::Idle)
|
||||||
|
@ -350,8 +377,7 @@ impl Worker for ScrubWorker {
|
||||||
ScrubWorkerState::Running(_) => return WorkerState::Busy,
|
ScrubWorkerState::Running(_) => return WorkerState::Busy,
|
||||||
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
|
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
|
||||||
ScrubWorkerState::Finished => (
|
ScrubWorkerState::Finished => (
|
||||||
self.persister.get_with(|p| p.time_last_complete_scrub)
|
self.persister.get_with(|p| p.time_next_run_scrub),
|
||||||
+ SCRUB_INTERVAL.as_millis() as u64,
|
|
||||||
ScrubWorkerCommand::Start,
|
ScrubWorkerCommand::Start,
|
||||||
),
|
),
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue