forked from Deuxfleurs/garage
Simplified and more aggressive worker exit logic
This commit is contained in:
parent
d4af27f920
commit
dfc131850a
8 changed files with 32 additions and 71 deletions
|
@ -148,7 +148,7 @@ impl Worker for RepairWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -341,7 +341,7 @@ impl Worker for ScrubWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
let (wait_until, command) = match &self.work {
|
let (wait_until, command) = match &self.work {
|
||||||
ScrubWorkerState::Running(_) => return WorkerState::Busy,
|
ScrubWorkerState::Running(_) => return WorkerState::Busy,
|
||||||
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
|
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
|
||||||
|
|
|
@ -540,7 +540,7 @@ impl Worker for ResyncWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
while self.index >= self.manager.resync.persisted.load().n_workers {
|
while self.index >= self.manager.resync.persisted.load().n_workers {
|
||||||
self.manager.resync.notify.notified().await
|
self.manager.resync.notify.notified().await
|
||||||
}
|
}
|
||||||
|
|
|
@ -136,7 +136,7 @@ impl Worker for RepairVersionsWorker {
|
||||||
Ok(WorkerState::Busy)
|
Ok(WorkerState::Busy)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -214,7 +214,7 @@ impl Worker for RepairBlockrefsWorker {
|
||||||
Ok(WorkerState::Busy)
|
Ok(WorkerState::Busy)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -348,10 +348,7 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
if *must_exit.borrow() {
|
|
||||||
return WorkerState::Done;
|
|
||||||
}
|
|
||||||
tokio::time::sleep(self.wait_delay).await;
|
tokio::time::sleep(self.wait_delay).await;
|
||||||
WorkerState::Busy
|
WorkerState::Busy
|
||||||
}
|
}
|
||||||
|
|
|
@ -340,10 +340,7 @@ where
|
||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
if *must_exit.borrow() {
|
|
||||||
return WorkerState::Done;
|
|
||||||
}
|
|
||||||
select! {
|
select! {
|
||||||
_ = tokio::time::sleep(Duration::from_secs(60)) => (),
|
_ = tokio::time::sleep(Duration::from_secs(60)) => (),
|
||||||
_ = self.0.data.merkle_todo_notify.notified() => (),
|
_ = self.0.data.merkle_todo_notify.notified() => (),
|
||||||
|
|
|
@ -71,10 +71,7 @@ where
|
||||||
Ok(WorkerState::Busy)
|
Ok(WorkerState::Busy)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
if *must_exit.borrow() {
|
|
||||||
return WorkerState::Done;
|
|
||||||
}
|
|
||||||
select! {
|
select! {
|
||||||
_ = tokio::time::sleep(Duration::from_secs(600)) => (),
|
_ = tokio::time::sleep(Duration::from_secs(600)) => (),
|
||||||
_ = self.0.data.insert_queue_notify.notified() => (),
|
_ = self.0.data.insert_queue_notify.notified() => (),
|
||||||
|
|
|
@ -593,10 +593,7 @@ impl<F: TableSchema + 'static, R: TableReplication + 'static> Worker for SyncWor
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
if *must_exit.borrow() {
|
|
||||||
return WorkerState::Done;
|
|
||||||
}
|
|
||||||
select! {
|
select! {
|
||||||
s = self.add_full_sync_rx.recv() => {
|
s = self.add_full_sync_rx.recv() => {
|
||||||
if let Some(()) = s {
|
if let Some(()) = s {
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use futures::future::*;
|
use futures::future::*;
|
||||||
|
@ -14,6 +14,10 @@ use crate::background::{WorkerInfo, WorkerStatus};
|
||||||
use crate::error::Error;
|
use crate::error::Error;
|
||||||
use crate::time::now_msec;
|
use crate::time::now_msec;
|
||||||
|
|
||||||
|
// All workers that haven't exited for this time after an exit signal was recieved
|
||||||
|
// will be interrupted in the middle of whatever they are doing.
|
||||||
|
const EXIT_DEADLINE: Duration = Duration::from_secs(8);
|
||||||
|
|
||||||
#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
|
#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
|
||||||
pub enum WorkerState {
|
pub enum WorkerState {
|
||||||
Busy,
|
Busy,
|
||||||
|
@ -50,10 +54,8 @@ pub trait Worker: Send {
|
||||||
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
|
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
|
||||||
|
|
||||||
/// Wait for work: await for some task to become available. This future can be interrupted in
|
/// Wait for work: await for some task to become available. This future can be interrupted in
|
||||||
/// the middle for any reason. This future doesn't have to await on must_exit.changed(), we
|
/// the middle for any reason, for example if an interrupt signal was recieved.
|
||||||
/// are doing it for you. Therefore it only receives a read refernce to must_exit which allows
|
async fn wait_for_work(&mut self) -> WorkerState;
|
||||||
/// it to check if we are exiting.
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) struct WorkerProcessor {
|
pub(crate) struct WorkerProcessor {
|
||||||
|
@ -93,11 +95,9 @@ impl WorkerProcessor {
|
||||||
let task_id = next_task_id;
|
let task_id = next_task_id;
|
||||||
next_task_id += 1;
|
next_task_id += 1;
|
||||||
let stop_signal = self.stop_signal.clone();
|
let stop_signal = self.stop_signal.clone();
|
||||||
let stop_signal_worker = self.stop_signal.clone();
|
|
||||||
let mut worker = WorkerHandler {
|
let mut worker = WorkerHandler {
|
||||||
task_id,
|
task_id,
|
||||||
stop_signal,
|
stop_signal,
|
||||||
stop_signal_worker,
|
|
||||||
worker: new_worker,
|
worker: new_worker,
|
||||||
state: WorkerState::Busy,
|
state: WorkerState::Busy,
|
||||||
errors: 0,
|
errors: 0,
|
||||||
|
@ -153,26 +153,14 @@ impl WorkerProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
// We are exiting, drain everything
|
// We are exiting, drain everything
|
||||||
let drain_half_time = Instant::now() + Duration::from_secs(5);
|
|
||||||
let drain_everything = async move {
|
let drain_everything = async move {
|
||||||
while let Some(mut worker) = workers.next().await {
|
while let Some(worker) = workers.next().await {
|
||||||
if worker.state == WorkerState::Done {
|
|
||||||
info!(
|
info!(
|
||||||
"Worker {} (TID {}) exited",
|
"Worker {} (TID {}) exited (last state: {:?})",
|
||||||
worker.worker.name(),
|
worker.worker.name(),
|
||||||
worker.task_id
|
worker.task_id,
|
||||||
|
worker.state
|
||||||
);
|
);
|
||||||
} else if Instant::now() > drain_half_time {
|
|
||||||
warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state);
|
|
||||||
} else {
|
|
||||||
workers.push(
|
|
||||||
async move {
|
|
||||||
worker.step().await;
|
|
||||||
worker
|
|
||||||
}
|
|
||||||
.boxed(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -180,7 +168,7 @@ impl WorkerProcessor {
|
||||||
_ = drain_everything => {
|
_ = drain_everything => {
|
||||||
info!("All workers exited peacefully \\o/");
|
info!("All workers exited peacefully \\o/");
|
||||||
}
|
}
|
||||||
_ = tokio::time::sleep(Duration::from_secs(9)) => {
|
_ = tokio::time::sleep(EXIT_DEADLINE) => {
|
||||||
error!("Some workers could not exit in time, we are cancelling some things in the middle");
|
error!("Some workers could not exit in time, we are cancelling some things in the middle");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -190,7 +178,6 @@ impl WorkerProcessor {
|
||||||
struct WorkerHandler {
|
struct WorkerHandler {
|
||||||
task_id: usize,
|
task_id: usize,
|
||||||
stop_signal: watch::Receiver<bool>,
|
stop_signal: watch::Receiver<bool>,
|
||||||
stop_signal_worker: watch::Receiver<bool>,
|
|
||||||
worker: Box<dyn Worker>,
|
worker: Box<dyn Worker>,
|
||||||
state: WorkerState,
|
state: WorkerState,
|
||||||
errors: usize,
|
errors: usize,
|
||||||
|
@ -225,33 +212,19 @@ impl WorkerHandler {
|
||||||
},
|
},
|
||||||
WorkerState::Throttled(delay) => {
|
WorkerState::Throttled(delay) => {
|
||||||
// Sleep for given delay and go back to busy state
|
// Sleep for given delay and go back to busy state
|
||||||
if !*self.stop_signal.borrow() {
|
|
||||||
select! {
|
select! {
|
||||||
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (),
|
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => {
|
||||||
|
self.state = WorkerState::Busy;
|
||||||
|
}
|
||||||
_ = self.stop_signal.changed() => (),
|
_ = self.stop_signal.changed() => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.state = WorkerState::Busy;
|
|
||||||
}
|
|
||||||
WorkerState::Idle => {
|
WorkerState::Idle => {
|
||||||
if *self.stop_signal.borrow() {
|
|
||||||
select! {
|
select! {
|
||||||
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
|
new_st = self.worker.wait_for_work() => {
|
||||||
self.state = new_st;
|
self.state = new_st;
|
||||||
}
|
}
|
||||||
_ = tokio::time::sleep(Duration::from_secs(1)) => {
|
_ = self.stop_signal.changed() => (),
|
||||||
// stay in Idle state
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
select! {
|
|
||||||
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
|
|
||||||
self.state = new_st;
|
|
||||||
}
|
|
||||||
_ = self.stop_signal.changed() => {
|
|
||||||
// stay in Idle state
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
WorkerState::Done => unreachable!(),
|
WorkerState::Done => unreachable!(),
|
||||||
|
|
Loading…
Add table
Reference in a new issue