forked from Deuxfleurs/garage
Refactor block resync loop; make workers infaillible
This commit is contained in:
parent
667e4e72a8
commit
4d4117f2b4
7 changed files with 49 additions and 47 deletions
|
@ -20,6 +20,16 @@ impl Repair {
|
||||||
&self,
|
&self,
|
||||||
opt: RepairOpt,
|
opt: RepairOpt,
|
||||||
must_exit: watch::Receiver<bool>,
|
must_exit: watch::Receiver<bool>,
|
||||||
|
) {
|
||||||
|
if let Err(e) = self.repair_worker_aux(opt, must_exit).await {
|
||||||
|
warn!("Repair worker failed with error: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn repair_worker_aux(
|
||||||
|
&self,
|
||||||
|
opt: RepairOpt,
|
||||||
|
must_exit: watch::Receiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let todo = |x| opt.what.as_ref().map(|y| *y == x).unwrap_or(true);
|
let todo = |x| opt.what.as_ref().map(|y| *y == x).unwrap_or(true);
|
||||||
|
|
||||||
|
|
|
@ -258,28 +258,31 @@ impl BlockManager {
|
||||||
async fn resync_loop(
|
async fn resync_loop(
|
||||||
self: Arc<Self>,
|
self: Arc<Self>,
|
||||||
mut must_exit: watch::Receiver<bool>,
|
mut must_exit: watch::Receiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) {
|
||||||
let mut n_failures = 0usize;
|
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? {
|
if let Err(e) = self.resync_iter(&mut must_exit).await {
|
||||||
|
warn!("Error in block resync loop: {}", e);
|
||||||
|
tokio::time::delay_for(Duration::from_secs(10)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<(), Error> {
|
||||||
|
if let Some(first_item) = self.resync_queue.iter().next() {
|
||||||
|
let (time_bytes, hash_bytes) = first_item?;
|
||||||
let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
|
let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
|
||||||
let now = now_msec();
|
let now = now_msec();
|
||||||
if now >= time_msec {
|
if now >= time_msec {
|
||||||
let hash = Hash::try_from(&hash_bytes[..]).unwrap();
|
let hash = Hash::try_from(&hash_bytes[..]).unwrap();
|
||||||
|
let res = self.resync_block(&hash).await;
|
||||||
if let Err(e) = self.resync_iter(&hash).await {
|
if let Err(e) = &res {
|
||||||
warn!("Failed to resync block {:?}, retrying later: {}", hash, e);
|
warn!("Error when resyncing {:?}: {}", hash, e);
|
||||||
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?;
|
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?;
|
||||||
n_failures += 1;
|
|
||||||
if n_failures >= 10 {
|
|
||||||
warn!("Too many resync failures, throttling.");
|
|
||||||
tokio::time::delay_for(Duration::from_secs(1)).await;
|
|
||||||
}
|
}
|
||||||
|
self.resync_queue.remove(&time_bytes)?;
|
||||||
|
res?; // propagate error to delay main loop
|
||||||
} else {
|
} else {
|
||||||
n_failures = 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
self.resync_queue.insert(time_bytes, hash_bytes)?;
|
|
||||||
let delay = tokio::time::delay_for(Duration::from_millis(time_msec - now));
|
let delay = tokio::time::delay_for(Duration::from_millis(time_msec - now));
|
||||||
select! {
|
select! {
|
||||||
_ = delay.fuse() => (),
|
_ = delay.fuse() => (),
|
||||||
|
@ -293,11 +296,10 @@ impl BlockManager {
|
||||||
_ = must_exit.recv().fuse() => (),
|
_ = must_exit.recv().fuse() => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn resync_iter(&self, hash: &Hash) -> Result<(), Error> {
|
async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
|
||||||
let lock = self.data_dir_lock.lock().await;
|
let lock = self.data_dir_lock.lock().await;
|
||||||
|
|
||||||
let path = self.block_path(hash);
|
let path = self.block_path(hash);
|
||||||
|
|
|
@ -318,9 +318,7 @@ impl System {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
self.clone()
|
self.clone()
|
||||||
.background
|
.background
|
||||||
.spawn_worker(format!("ping loop"), |stop_signal| {
|
.spawn_worker(format!("ping loop"), |stop_signal| self2.ping_loop(stop_signal));
|
||||||
self2.ping_loop(stop_signal).map(Ok)
|
|
||||||
});
|
|
||||||
|
|
||||||
if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) {
|
if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
|
@ -329,7 +327,6 @@ impl System {
|
||||||
.spawn_worker(format!("Consul loop"), |stop_signal| {
|
.spawn_worker(format!("Consul loop"), |stop_signal| {
|
||||||
self2
|
self2
|
||||||
.consul_loop(stop_signal, consul_host, consul_service_name)
|
.consul_loop(stop_signal, consul_host, consul_service_name)
|
||||||
.map(Ok)
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,7 +70,7 @@ where
|
||||||
gc
|
gc
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) -> Result<(), Error> {
|
async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
match self.gc_loop_iter().await {
|
match self.gc_loop_iter().await {
|
||||||
Ok(true) => {
|
Ok(true) => {
|
||||||
|
@ -89,7 +89,6 @@ where
|
||||||
_ = must_exit.recv().fuse() => (),
|
_ = must_exit.recv().fuse() => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn gc_loop_iter(&self) -> Result<bool, Error> {
|
async fn gc_loop_iter(&self) -> Result<bool, Error> {
|
||||||
|
|
|
@ -104,7 +104,7 @@ impl MerkleUpdater {
|
||||||
async fn updater_loop(
|
async fn updater_loop(
|
||||||
self: Arc<Self>,
|
self: Arc<Self>,
|
||||||
mut must_exit: watch::Receiver<bool>,
|
mut must_exit: watch::Receiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) {
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
if let Some(x) = self.todo.iter().next() {
|
if let Some(x) = self.todo.iter().next() {
|
||||||
match x {
|
match x {
|
||||||
|
@ -131,7 +131,6 @@ impl MerkleUpdater {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn update_item(&self, k: &[u8], vhash_by: &[u8]) -> Result<(), Error> {
|
fn update_item(&self, k: &[u8], vhash_by: &[u8]) -> Result<(), Error> {
|
||||||
|
|
|
@ -136,7 +136,7 @@ where
|
||||||
self: Arc<Self>,
|
self: Arc<Self>,
|
||||||
mut must_exit: watch::Receiver<bool>,
|
mut must_exit: watch::Receiver<bool>,
|
||||||
mut busy_rx: mpsc::UnboundedReceiver<bool>,
|
mut busy_rx: mpsc::UnboundedReceiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) {
|
||||||
let mut prev_ring: Arc<Ring> = self.aux.system.ring.borrow().clone();
|
let mut prev_ring: Arc<Ring> = self.aux.system.ring.borrow().clone();
|
||||||
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.aux.system.ring.clone();
|
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.aux.system.ring.clone();
|
||||||
let mut nothing_to_do_since = Some(Instant::now());
|
let mut nothing_to_do_since = Some(Instant::now());
|
||||||
|
@ -183,7 +183,6 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_full_sync(&self) {
|
pub fn add_full_sync(&self) {
|
||||||
|
@ -197,11 +196,11 @@ where
|
||||||
self: Arc<Self>,
|
self: Arc<Self>,
|
||||||
mut must_exit: watch::Receiver<bool>,
|
mut must_exit: watch::Receiver<bool>,
|
||||||
busy_tx: mpsc::UnboundedSender<bool>,
|
busy_tx: mpsc::UnboundedSender<bool>,
|
||||||
) -> Result<(), Error> {
|
) {
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
let task = self.todo.lock().unwrap().pop_task();
|
let task = self.todo.lock().unwrap().pop_task();
|
||||||
if let Some(partition) = task {
|
if let Some(partition) = task {
|
||||||
busy_tx.send(true)?;
|
busy_tx.send(true).unwrap();
|
||||||
let res = self
|
let res = self
|
||||||
.clone()
|
.clone()
|
||||||
.sync_partition(&partition, &mut must_exit)
|
.sync_partition(&partition, &mut must_exit)
|
||||||
|
@ -213,11 +212,10 @@ where
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
busy_tx.send(false)?;
|
busy_tx.send(false).unwrap();
|
||||||
tokio::time::delay_for(Duration::from_secs(1)).await;
|
tokio::time::delay_for(Duration::from_secs(1)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn sync_partition(
|
async fn sync_partition(
|
||||||
|
|
|
@ -76,16 +76,13 @@ impl BackgroundRunner {
|
||||||
pub fn spawn_worker<F, T>(&self, name: String, worker: F)
|
pub fn spawn_worker<F, T>(&self, name: String, worker: F)
|
||||||
where
|
where
|
||||||
F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static,
|
F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static,
|
||||||
T: Future<Output = JobOutput> + Send + 'static,
|
T: Future<Output = ()> + Send + 'static,
|
||||||
{
|
{
|
||||||
let mut workers = self.workers.lock().unwrap();
|
let mut workers = self.workers.lock().unwrap();
|
||||||
let stop_signal = self.stop_signal.clone();
|
let stop_signal = self.stop_signal.clone();
|
||||||
workers.push(tokio::spawn(async move {
|
workers.push(tokio::spawn(async move {
|
||||||
if let Err(e) = worker(stop_signal).await {
|
worker(stop_signal).await;
|
||||||
error!("Worker stopped with error: {}, error: {}", name, e);
|
info!("Worker exited: {}", name);
|
||||||
} else {
|
|
||||||
info!("Worker exited successfully: {}", name);
|
|
||||||
}
|
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue