forked from Deuxfleurs/garage
Add tranquilizer mechanism to improve on token bucket mechanism
This commit is contained in:
parent
6f13d083ab
commit
2090a6187f
5 changed files with 95 additions and 30 deletions
|
@ -285,9 +285,9 @@ pub enum RepairWhat {
|
||||||
/// Verify integrity of all blocks on disc (extremely slow, i/o intensive)
|
/// Verify integrity of all blocks on disc (extremely slow, i/o intensive)
|
||||||
#[structopt(name = "scrub")]
|
#[structopt(name = "scrub")]
|
||||||
Scrub {
|
Scrub {
|
||||||
/// Limit on i/o speed, in B/s
|
/// Tranquility factor (see tranquilizer documentation)
|
||||||
#[structopt(name = "limit")]
|
#[structopt(name = "tranquility", default_value = "2")]
|
||||||
limit: Option<usize>,
|
tranquility: u32,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -51,11 +51,11 @@ impl Repair {
|
||||||
.repair_data_store(&must_exit)
|
.repair_data_store(&must_exit)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
RepairWhat::Scrub { limit } => {
|
RepairWhat::Scrub { tranquility } => {
|
||||||
info!("Verifying integrity of stored blocks");
|
info!("Verifying integrity of stored blocks");
|
||||||
self.garage
|
self.garage
|
||||||
.block_manager
|
.block_manager
|
||||||
.scrub_data_store(&must_exit, limit)
|
.scrub_data_store(&must_exit, tranquility)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@ use tokio::sync::{watch, Mutex, Notify};
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::*;
|
use garage_util::error::*;
|
||||||
use garage_util::time::*;
|
use garage_util::time::*;
|
||||||
use garage_util::token_bucket::TokenBucket;
|
use garage_util::tranquilizer::Tranquilizer;
|
||||||
|
|
||||||
use garage_rpc::system::System;
|
use garage_rpc::system::System;
|
||||||
use garage_rpc::*;
|
use garage_rpc::*;
|
||||||
|
@ -29,6 +29,7 @@ use crate::garage::Garage;
|
||||||
pub const INLINE_THRESHOLD: usize = 3072;
|
pub const INLINE_THRESHOLD: usize = 3072;
|
||||||
|
|
||||||
pub const BACKGROUND_WORKERS: u64 = 1;
|
pub const BACKGROUND_WORKERS: u64 = 1;
|
||||||
|
/// Tranquility factor that `resync_loop` passes to its tranquilizer after a
/// resync iteration that returned `Ok(true)`.
pub const BACKGROUND_TRANQUILITY: u32 = 3;
|
||||||
|
|
||||||
const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(42);
|
const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(42);
|
||||||
const BLOCK_GC_TIMEOUT: Duration = Duration::from_secs(60);
|
const BLOCK_GC_TIMEOUT: Duration = Duration::from_secs(60);
|
||||||
|
@ -214,24 +215,15 @@ impl BlockManager {
|
||||||
pub async fn scrub_data_store(
|
pub async fn scrub_data_store(
|
||||||
&self,
|
&self,
|
||||||
must_exit: &watch::Receiver<bool>,
|
must_exit: &watch::Receiver<bool>,
|
||||||
speed_limit: Option<usize>,
|
tranquility: u32,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let token_bucket = speed_limit.map(|rate| TokenBucket::new(rate as u64));
|
let tranquilizer = Tranquilizer::new(30);
|
||||||
self.for_each_file(
|
self.for_each_file(
|
||||||
token_bucket,
|
tranquilizer,
|
||||||
move |mut token_bucket, hash| {
|
move |mut tranquilizer, hash| async move {
|
||||||
async move {
|
let _ = self.read_block(&hash).await;
|
||||||
let len = match self.read_block(&hash).await {
|
tranquilizer.tranquilize(tranquility).await;
|
||||||
Ok(BlockRpc::PutBlock(PutBlockMessage { data, .. })) => data.len(),
|
Ok(tranquilizer)
|
||||||
Ok(_) => unreachable!(),
|
|
||||||
Err(_) => 0, // resync and warn message made by read_block if necessary
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some(tb) = &mut token_bucket {
|
|
||||||
tb.take(len as u64).await;
|
|
||||||
}
|
|
||||||
Ok(token_bucket)
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
must_exit,
|
must_exit,
|
||||||
)
|
)
|
||||||
|
@ -381,18 +373,32 @@ impl BlockManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn resync_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
|
/// Background loop that calls `resync_iter` repeatedly until `must_exit` holds `true`.
///
/// After an iteration that returns `Ok(true)` the loop tranquilizes with
/// `BACKGROUND_TRANQUILITY`; after `Ok(false)` or an error it only restarts the
/// tranquilizer's step timer, so that time is not recorded as an observation.
async fn resync_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
|
||||||
|
// Sliding window of at most 30 step-duration observations.
let mut tranquilizer = Tranquilizer::new(30);
|
||||||
|
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
if let Err(e) = self.resync_iter(&mut must_exit).await {
|
match self.resync_iter(&mut must_exit).await {
|
||||||
warn!("Error in block resync loop: {}", e);
|
Ok(true) => {
|
||||||
select! {
|
tranquilizer.tranquilize(BACKGROUND_TRANQUILITY).await;
|
||||||
_ = tokio::time::sleep(Duration::from_secs(1)).fuse() => {},
|
}
|
||||||
_ = must_exit.changed().fuse() => {},
|
Ok(false) => {
|
||||||
|
tranquilizer.reset();
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// The errors that we have here are only Sled errors
|
||||||
|
// We don't really know how to handle them so just ¯\_(ツ)_/¯
|
||||||
|
// (there is kind of an assumption that Sled won't error on us,
|
||||||
|
// if it does there is not much we can do -- TODO should we just panic?)
|
||||||
|
error!(
|
||||||
|
"Could not do a resync iteration: {} (this is a very bad error)",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
// NOTE(review): the previous version waited up to 1s after an error before
// retrying; this arm loops again immediately, so a persistent error could
// spin the loop and flood the log -- confirm this is intended.
tranquilizer.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<(), Error> {
|
async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<bool, Error> {
|
||||||
if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? {
|
if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? {
|
||||||
let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
|
let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
|
||||||
let now = now_msec();
|
let now = now_msec();
|
||||||
|
@ -403,7 +409,7 @@ impl BlockManager {
|
||||||
warn!("Error when resyncing {:?}: {}", hash, e);
|
warn!("Error when resyncing {:?}: {}", hash, e);
|
||||||
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?;
|
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?;
|
||||||
}
|
}
|
||||||
res?; // propagate error to delay main loop
|
Ok(true)
|
||||||
} else {
|
} else {
|
||||||
self.resync_queue.insert(time_bytes, hash_bytes)?;
|
self.resync_queue.insert(time_bytes, hash_bytes)?;
|
||||||
let delay = tokio::time::sleep(Duration::from_millis(time_msec - now));
|
let delay = tokio::time::sleep(Duration::from_millis(time_msec - now));
|
||||||
|
@ -412,14 +418,15 @@ impl BlockManager {
|
||||||
_ = self.resync_notify.notified().fuse() => {},
|
_ = self.resync_notify.notified().fuse() => {},
|
||||||
_ = must_exit.changed().fuse() => {},
|
_ = must_exit.changed().fuse() => {},
|
||||||
}
|
}
|
||||||
|
Ok(false)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
select! {
|
select! {
|
||||||
_ = self.resync_notify.notified().fuse() => {},
|
_ = self.resync_notify.notified().fuse() => {},
|
||||||
_ = must_exit.changed().fuse() => {},
|
_ = must_exit.changed().fuse() => {},
|
||||||
}
|
}
|
||||||
|
Ok(false)
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
|
async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
|
||||||
|
|
|
@ -10,3 +10,4 @@ pub mod error;
|
||||||
pub mod persister;
|
pub mod persister;
|
||||||
pub mod time;
|
pub mod time;
|
||||||
pub mod token_bucket;
|
pub mod token_bucket;
|
||||||
|
pub mod tranquilizer;
|
||||||
|
|
57
src/util/tranquilizer.rs
Normal file
57
src/util/tranquilizer.rs
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
use std::collections::VecDeque;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
use tokio::time::sleep;
|
||||||
|
|
||||||
|
/// A tranquilizer is a helper object that is used to make
|
||||||
|
/// background operations not take up too much time.
|
||||||
|
///
|
||||||
|
/// Background operations are done in a loop that does the following:
|
||||||
|
/// - do one step of the background process
|
||||||
|
/// - tranquilize, i.e. wait some time to not overload the system
|
||||||
|
///
|
||||||
|
/// The tranquilizer observes how long the steps take, and keeps
|
||||||
|
/// in memory a number of observations. The tranquilize operation
|
||||||
|
/// simply sleeps k * avg(observed step times), where k is
|
||||||
|
/// the tranquility factor. For instance with a tranquility of 2,
|
||||||
|
/// the tranquilizer will sleep on average 2 units of time for every
|
||||||
|
/// 1 unit of time spent doing the background task.
|
||||||
|
pub struct Tranquilizer {
|
||||||
|
/// Maximum number of step durations retained; older ones are dropped first.
n_observations: usize,
|
||||||
|
/// Retained step durations, oldest at the front and newest at the back.
observations: VecDeque<Duration>,
|
||||||
|
/// Running sum of the durations currently held in `observations`.
sum_observations: Duration,
|
||||||
|
/// Instant from which the current step is timed; set on creation and by `reset`.
last_step_begin: Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Tranquilizer {
|
||||||
|
/// Creates a tranquilizer that retains at most `n_observations` step durations.
///
/// The first step is timed from the moment of creation.
pub fn new(n_observations: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
n_observations,
|
||||||
|
// One extra slot: `tranquilize` pushes the new observation before trimming the window.
observations: VecDeque::with_capacity(n_observations + 1),
|
||||||
|
sum_observations: Duration::ZERO,
|
||||||
|
last_step_begin: Instant::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Records how long the step that just finished took, then sleeps for
/// `tranquility` times the average of the retained observations.
///
/// The step timer is restarted after the sleep, so the sleep itself is not
/// counted in the next observation. When `n_observations` is 0 nothing is
/// retained and no sleep happens.
pub async fn tranquilize(&mut self, tranquility: u32) {
|
||||||
|
// Time elapsed since the step timer was last (re)started.
let observation = Instant::now() - self.last_step_begin;
|
||||||
|
|
||||||
|
self.observations.push_back(observation);
|
||||||
|
self.sum_observations += observation;
|
||||||
|
|
||||||
|
// Trim the window back to `n_observations`, keeping the running sum in sync.
while self.observations.len() > self.n_observations {
|
||||||
|
// The loop condition guarantees the deque is non-empty, so `unwrap` cannot fail here.
self.sum_observations -= self.observations.pop_front().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// The window is only empty here when `n_observations` is 0; skipping also avoids
// dividing by zero below.
if !self.observations.is_empty() {
|
||||||
|
// delay = tranquility * average(retained step durations)
let delay = (tranquility * self.sum_observations) / (self.observations.len() as u32);
|
||||||
|
sleep(delay).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Restart the timer so the next observation excludes the sleep above.
self.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Restarts the step timer without recording an observation.
pub fn reset(&mut self) {
|
||||||
|
self.last_step_begin = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue