Refactor background runner and get rid of job worker
continuous-integration/drone/push Build is failing Details
continuous-integration/drone/pr Build is failing Details

This commit is contained in:
Alex 2022-12-14 12:51:16 +01:00
parent 2183518edc
commit d56c472712
Signed by: lx
GPG Key ID: 0E496D15096376BE
16 changed files with 89 additions and 213 deletions

View File

@ -23,6 +23,7 @@ use garage_rpc::rpc_helper::netapp::stream::{stream_asyncread, ByteStream};
use garage_db as db; use garage_db as db;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::metrics::RecordDuration; use garage_util::metrics::RecordDuration;
@ -144,19 +145,17 @@ impl BlockManager {
block_manager block_manager
} }
pub fn spawn_workers(self: &Arc<Self>) { pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
// Spawn a bunch of resync workers // Spawn a bunch of resync workers
for index in 0..MAX_RESYNC_WORKERS { for index in 0..MAX_RESYNC_WORKERS {
let worker = ResyncWorker::new(index, self.clone()); let worker = ResyncWorker::new(index, self.clone());
self.system.background.spawn_worker(worker); bg.spawn_worker(worker);
} }
// Spawn scrub worker // Spawn scrub worker
let (scrub_tx, scrub_rx) = mpsc::channel(1); let (scrub_tx, scrub_rx) = mpsc::channel(1);
self.tx_scrub_command.store(Some(Arc::new(scrub_tx))); self.tx_scrub_command.store(Some(Arc::new(scrub_tx)));
self.system bg.spawn_worker(ScrubWorker::new(self.clone(), scrub_rx));
.background
.spawn_worker(ScrubWorker::new(self.clone(), scrub_rx));
} }
/// Ask nodes that might have a (possibly compressed) block for it /// Ask nodes that might have a (possibly compressed) block for it

View File

@ -5,6 +5,7 @@ use std::sync::Arc;
use async_trait::async_trait; use async_trait::async_trait;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use garage_util::background::BackgroundRunner;
use garage_util::crdt::*; use garage_util::crdt::*;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error as GarageError; use garage_util::error::Error as GarageError;
@ -74,13 +75,18 @@ impl Rpc for AdminRpc {
pub struct AdminRpcHandler { pub struct AdminRpcHandler {
garage: Arc<Garage>, garage: Arc<Garage>,
background: Arc<BackgroundRunner>,
endpoint: Arc<Endpoint<AdminRpc, Self>>, endpoint: Arc<Endpoint<AdminRpc, Self>>,
} }
impl AdminRpcHandler { impl AdminRpcHandler {
pub fn new(garage: Arc<Garage>) -> Arc<Self> { pub fn new(garage: Arc<Garage>, background: Arc<BackgroundRunner>) -> Arc<Self> {
let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into()); let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into());
let admin = Arc::new(Self { garage, endpoint }); let admin = Arc::new(Self {
garage,
background,
endpoint,
});
admin.endpoint.set_handler(admin.clone()); admin.endpoint.set_handler(admin.clone());
admin admin
} }
@ -759,7 +765,7 @@ impl AdminRpcHandler {
))) )))
} }
} else { } else {
launch_online_repair(self.garage.clone(), opt).await?; launch_online_repair(&self.garage, &self.background, opt).await?;
Ok(AdminRpc::Ok(format!( Ok(AdminRpc::Ok(format!(
"Repair launched on {:?}", "Repair launched on {:?}",
self.garage.system.id self.garage.system.id
@ -925,12 +931,11 @@ impl AdminRpcHandler {
async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> { async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> {
match cmd { match cmd {
WorkerOperation::List { opt } => { WorkerOperation::List { opt } => {
let workers = self.garage.background.get_worker_info(); let workers = self.background.get_worker_info();
Ok(AdminRpc::WorkerList(workers, *opt)) Ok(AdminRpc::WorkerList(workers, *opt))
} }
WorkerOperation::Info { tid } => { WorkerOperation::Info { tid } => {
let info = self let info = self
.garage
.background .background
.get_worker_info() .get_worker_info()
.get(tid) .get(tid)

View File

@ -1,8 +1,5 @@
use std::path::PathBuf; use std::path::PathBuf;
use tokio::sync::watch;
use garage_util::background::*;
use garage_util::config::*; use garage_util::config::*;
use garage_util::error::*; use garage_util::error::*;
@ -20,12 +17,8 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
info!("Loading configuration..."); info!("Loading configuration...");
let config = read_config(config_file)?; let config = read_config(config_file)?;
info!("Initializing background runner...");
let (done_tx, done_rx) = watch::channel(false);
let (background, await_background_done) = BackgroundRunner::new(16, done_rx);
info!("Initializing Garage main data store..."); info!("Initializing Garage main data store...");
let garage = Garage::new(config.clone(), background)?; let garage = Garage::new(config)?;
info!("Launching repair operation..."); info!("Launching repair operation...");
match opt.what { match opt.what {
@ -43,13 +36,7 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
} }
} }
info!("Repair operation finished, shutting down Garage internals..."); info!("Repair operation finished, shutting down...");
done_tx.send(true).unwrap();
drop(garage);
await_background_done.await?;
info!("Cleaning up...");
Ok(()) Ok(())
} }

View File

@ -15,7 +15,11 @@ use garage_util::error::Error;
use crate::*; use crate::*;
pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) -> Result<(), Error> { pub async fn launch_online_repair(
garage: &Arc<Garage>,
bg: &BackgroundRunner,
opt: RepairOpt,
) -> Result<(), Error> {
match opt.what { match opt.what {
RepairWhat::Tables => { RepairWhat::Tables => {
info!("Launching a full sync of tables"); info!("Launching a full sync of tables");
@ -27,23 +31,17 @@ pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) -> Result
} }
RepairWhat::Versions => { RepairWhat::Versions => {
info!("Repairing the versions table"); info!("Repairing the versions table");
garage bg.spawn_worker(RepairVersionsWorker::new(garage.clone()));
.background
.spawn_worker(RepairVersionsWorker::new(garage.clone()));
} }
RepairWhat::BlockRefs => { RepairWhat::BlockRefs => {
info!("Repairing the block refs table"); info!("Repairing the block refs table");
garage bg.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
.background
.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
} }
RepairWhat::Blocks => { RepairWhat::Blocks => {
info!("Repairing the stored blocks"); info!("Repairing the stored blocks");
garage bg.spawn_worker(garage_block::repair::RepairWorker::new(
.background garage.block_manager.clone(),
.spawn_worker(garage_block::repair::RepairWorker::new( ));
garage.block_manager.clone(),
));
} }
RepairWhat::Scrub { cmd } => { RepairWhat::Scrub { cmd } => {
let cmd = match cmd { let cmd = match cmd {

View File

@ -35,15 +35,15 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
#[cfg(feature = "metrics")] #[cfg(feature = "metrics")]
let metrics_exporter = opentelemetry_prometheus::exporter().init(); let metrics_exporter = opentelemetry_prometheus::exporter().init();
info!("Initializing Garage main data store...");
let garage = Garage::new(config.clone())?;
info!("Initializing background runner..."); info!("Initializing background runner...");
let watch_cancel = watch_shutdown_signal(); let watch_cancel = watch_shutdown_signal();
let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone());
info!("Initializing Garage main data store...");
let garage = Garage::new(config.clone(), background)?;
info!("Spawning Garage workers..."); info!("Spawning Garage workers...");
garage.spawn_workers(); garage.spawn_workers(&background);
if config.admin.trace_sink.is_some() { if config.admin.trace_sink.is_some() {
info!("Initialize tracing..."); info!("Initialize tracing...");
@ -66,7 +66,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone())); let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
info!("Create admin RPC handler..."); info!("Create admin RPC handler...");
AdminRpcHandler::new(garage.clone()); AdminRpcHandler::new(garage.clone(), background.clone());
// ---- Launch public-facing API servers ---- // ---- Launch public-facing API servers ----

View File

@ -39,8 +39,6 @@ pub struct Garage {
/// The local database /// The local database
pub db: db::Db, pub db: db::Db,
/// A background job runner
pub background: Arc<BackgroundRunner>,
/// The membership manager /// The membership manager
pub system: Arc<System>, pub system: Arc<System>,
/// The block manager /// The block manager
@ -78,7 +76,7 @@ pub struct GarageK2V {
impl Garage { impl Garage {
/// Create and run garage /// Create and run garage
pub fn new(config: Config, background: Arc<BackgroundRunner>) -> Result<Arc<Self>, Error> { pub fn new(config: Config) -> Result<Arc<Self>, Error> {
// Create meta dir and data dir if they don't exist already // Create meta dir and data dir if they don't exist already
std::fs::create_dir_all(&config.metadata_dir) std::fs::create_dir_all(&config.metadata_dir)
.ok_or_message("Unable to create Garage metadata directory")?; .ok_or_message("Unable to create Garage metadata directory")?;
@ -167,7 +165,7 @@ impl Garage {
.expect("Invalid replication_mode in config file."); .expect("Invalid replication_mode in config file.");
info!("Initialize membership management system..."); info!("Initialize membership management system...");
let system = System::new(network_key, background.clone(), replication_mode, &config)?; let system = System::new(network_key, replication_mode, &config)?;
let data_rep_param = TableShardedReplication { let data_rep_param = TableShardedReplication {
system: system.clone(), system: system.clone(),
@ -225,7 +223,6 @@ impl Garage {
info!("Initialize version_table..."); info!("Initialize version_table...");
let version_table = Table::new( let version_table = Table::new(
VersionTable { VersionTable {
background: background.clone(),
block_ref_table: block_ref_table.clone(), block_ref_table: block_ref_table.clone(),
}, },
meta_rep_param.clone(), meta_rep_param.clone(),
@ -240,7 +237,6 @@ impl Garage {
#[allow(clippy::redundant_clone)] #[allow(clippy::redundant_clone)]
let object_table = Table::new( let object_table = Table::new(
ObjectTable { ObjectTable {
background: background.clone(),
version_table: version_table.clone(), version_table: version_table.clone(),
object_counter_table: object_counter_table.clone(), object_counter_table: object_counter_table.clone(),
}, },
@ -258,7 +254,6 @@ impl Garage {
config, config,
replication_mode, replication_mode,
db, db,
background,
system, system,
block_manager, block_manager,
bucket_table, bucket_table,
@ -273,20 +268,20 @@ impl Garage {
})) }))
} }
pub fn spawn_workers(&self) { pub fn spawn_workers(&self, bg: &BackgroundRunner) {
self.block_manager.spawn_workers(); self.block_manager.spawn_workers(bg);
self.bucket_table.spawn_workers(); self.bucket_table.spawn_workers(bg);
self.bucket_alias_table.spawn_workers(); self.bucket_alias_table.spawn_workers(bg);
self.key_table.spawn_workers(); self.key_table.spawn_workers(bg);
self.object_table.spawn_workers(); self.object_table.spawn_workers(bg);
self.object_counter_table.spawn_workers(); self.object_counter_table.spawn_workers(bg);
self.version_table.spawn_workers(); self.version_table.spawn_workers(bg);
self.block_ref_table.spawn_workers(); self.block_ref_table.spawn_workers(bg);
#[cfg(feature = "k2v")] #[cfg(feature = "k2v")]
self.k2v.spawn_workers(); self.k2v.spawn_workers(bg);
} }
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper { pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
@ -324,8 +319,8 @@ impl GarageK2V {
} }
} }
pub fn spawn_workers(&self) { pub fn spawn_workers(&self, bg: &BackgroundRunner) {
self.item_table.spawn_workers(); self.item_table.spawn_workers(bg);
self.counter_table.spawn_workers(); self.counter_table.spawn_workers(bg);
} }
} }

View File

@ -9,6 +9,7 @@ use garage_db as db;
use garage_rpc::ring::Ring; use garage_rpc::ring::Ring;
use garage_rpc::system::System; use garage_rpc::system::System;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::time::*; use garage_util::time::*;
@ -164,8 +165,8 @@ impl<T: CountedItem> IndexCounter<T> {
}) })
} }
pub fn spawn_workers(&self) { pub fn spawn_workers(&self, bg: &BackgroundRunner) {
self.table.spawn_workers(); self.table.spawn_workers(bg);
} }
pub fn count( pub fn count(

View File

@ -4,7 +4,6 @@ use std::sync::Arc;
use garage_db as db; use garage_db as db;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::crdt::*; use garage_table::crdt::*;
@ -221,7 +220,6 @@ impl Crdt for Object {
} }
pub struct ObjectTable { pub struct ObjectTable {
pub background: Arc<BackgroundRunner>,
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>, pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
pub object_counter_table: Arc<IndexCounter<Object>>, pub object_counter_table: Arc<IndexCounter<Object>>,
} }

View File

@ -3,7 +3,6 @@ use std::sync::Arc;
use garage_db as db; use garage_db as db;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::crdt::*; use garage_table::crdt::*;
@ -127,7 +126,6 @@ impl Crdt for Version {
} }
pub struct VersionTable { pub struct VersionTable {
pub background: Arc<BackgroundRunner>,
pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>, pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
} }

View File

@ -5,7 +5,6 @@ use std::time::Duration;
use futures::future::join_all; use futures::future::join_all;
use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt; use futures::stream::StreamExt;
use futures_util::future::FutureExt;
use tokio::select; use tokio::select;
use tokio::sync::watch; use tokio::sync::watch;
@ -24,7 +23,6 @@ pub use netapp::message::{
use netapp::peering::fullmesh::FullMeshPeeringStrategy; use netapp::peering::fullmesh::FullMeshPeeringStrategy;
pub use netapp::{self, NetApp, NodeID}; pub use netapp::{self, NetApp, NodeID};
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::metrics::RecordDuration; use garage_util::metrics::RecordDuration;
@ -94,7 +92,6 @@ pub struct RpcHelper(Arc<RpcHelperInner>);
struct RpcHelperInner { struct RpcHelperInner {
our_node_id: Uuid, our_node_id: Uuid,
fullmesh: Arc<FullMeshPeeringStrategy>, fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>, ring: watch::Receiver<Arc<Ring>>,
metrics: RpcMetrics, metrics: RpcMetrics,
rpc_timeout: Duration, rpc_timeout: Duration,
@ -104,7 +101,6 @@ impl RpcHelper {
pub(crate) fn new( pub(crate) fn new(
our_node_id: Uuid, our_node_id: Uuid,
fullmesh: Arc<FullMeshPeeringStrategy>, fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>, ring: watch::Receiver<Arc<Ring>>,
rpc_timeout: Option<Duration>, rpc_timeout: Option<Duration>,
) -> Self { ) -> Self {
@ -113,7 +109,6 @@ impl RpcHelper {
Self(Arc::new(RpcHelperInner { Self(Arc::new(RpcHelperInner {
our_node_id, our_node_id,
fullmesh, fullmesh,
background,
ring, ring,
metrics, metrics,
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
@ -377,16 +372,13 @@ impl RpcHelper {
if !resp_stream.is_empty() { if !resp_stream.is_empty() {
// Continue remaining requests in background. // Continue remaining requests in background.
// Continue the remaining requests immediately using tokio::spawn // Note: these requests can get interrupted on process shutdown,
// but enqueue a task in the background runner // we must not count on them being executed for certain.
// to ensure that the process won't exit until the requests are done // For all background things that have to happen with certainty,
// (if we had just enqueued the resp_stream.collect directly in the background runner, // they have to be put in a proper queue that is persisted to disk.
// the requests might have been put on hold in the background runner's queue, tokio::spawn(async move {
// in which case they might timeout or otherwise fail)
let wait_finished_fut = tokio::spawn(async move {
resp_stream.collect::<Vec<Result<_, _>>>().await; resp_stream.collect::<Vec<Result<_, _>>>().await;
}); });
self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
} }
} }

View File

@ -21,7 +21,7 @@ use netapp::peering::fullmesh::FullMeshPeeringStrategy;
use netapp::util::parse_and_resolve_peer_addr_async; use netapp::util::parse_and_resolve_peer_addr_async;
use netapp::{NetApp, NetworkKey, NodeID, NodeKey}; use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
use garage_util::background::BackgroundRunner; use garage_util::background::{self};
use garage_util::config::Config; use garage_util::config::Config;
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
use garage_util::config::KubernetesDiscoveryConfig; use garage_util::config::KubernetesDiscoveryConfig;
@ -110,9 +110,6 @@ pub struct System {
pub ring: watch::Receiver<Arc<Ring>>, pub ring: watch::Receiver<Arc<Ring>>,
update_ring: Mutex<watch::Sender<Arc<Ring>>>, update_ring: Mutex<watch::Sender<Arc<Ring>>>,
/// The job runner of this node
pub background: Arc<BackgroundRunner>,
/// Path to metadata directory /// Path to metadata directory
pub metadata_dir: PathBuf, pub metadata_dir: PathBuf,
} }
@ -232,7 +229,6 @@ impl System {
/// Create this node's membership manager /// Create this node's membership manager
pub fn new( pub fn new(
network_key: NetworkKey, network_key: NetworkKey,
background: Arc<BackgroundRunner>,
replication_mode: ReplicationMode, replication_mode: ReplicationMode,
config: &Config, config: &Config,
) -> Result<Arc<Self>, Error> { ) -> Result<Arc<Self>, Error> {
@ -354,7 +350,6 @@ impl System {
rpc: RpcHelper::new( rpc: RpcHelper::new(
netapp.id.into(), netapp.id.into(),
fullmesh, fullmesh,
background.clone(),
ring.clone(), ring.clone(),
config.rpc_timeout_msec.map(Duration::from_millis), config.rpc_timeout_msec.map(Duration::from_millis),
), ),
@ -372,7 +367,6 @@ impl System {
ring, ring,
update_ring: Mutex::new(update_ring), update_ring: Mutex::new(update_ring),
background,
metadata_dir: config.metadata_dir.clone(), metadata_dir: config.metadata_dir.clone(),
}); });
sys.system_endpoint.set_handler(sys.clone()); sys.system_endpoint.set_handler(sys.clone());
@ -578,7 +572,7 @@ impl System {
} }
/// Save network configuration to disc /// Save network configuration to disc
async fn save_cluster_layout(self: Arc<Self>) -> Result<(), Error> { async fn save_cluster_layout(&self) -> Result<(), Error> {
let ring: Arc<Ring> = self.ring.borrow().clone(); let ring: Arc<Ring> = self.ring.borrow().clone();
self.persist_cluster_layout self.persist_cluster_layout
.save_async(&ring.layout) .save_async(&ring.layout)
@ -631,7 +625,7 @@ impl System {
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
{ {
let self2 = self.clone(); let self2 = self.clone();
self.background.spawn_cancellable(async move { background::spawn(async move {
self2.pull_cluster_layout(from).await; self2.pull_cluster_layout(from).await;
Ok(()) Ok(())
}); });
@ -676,7 +670,7 @@ impl System {
drop(update_ring); drop(update_ring);
let self2 = self.clone(); let self2 = self.clone();
self.background.spawn_cancellable(async move { background::spawn(async move {
self2 self2
.rpc .rpc
.broadcast( .broadcast(
@ -687,7 +681,8 @@ impl System {
.await?; .await?;
Ok(()) Ok(())
}); });
self.background.spawn(self.clone().save_cluster_layout());
self.save_cluster_layout().await?;
} }
Ok(SystemRpc::Ok) Ok(SystemRpc::Ok)
@ -773,7 +768,7 @@ impl System {
} }
for (node_id, node_addr) in ping_list { for (node_id, node_addr) in ping_list {
tokio::spawn( background::spawn(
self.netapp self.netapp
.clone() .clone()
.try_connect(node_addr, node_id) .try_connect(node_addr, node_id)
@ -787,11 +782,10 @@ impl System {
} }
#[cfg(feature = "consul-discovery")] #[cfg(feature = "consul-discovery")]
self.background.spawn(self.clone().advertise_to_consul()); background::spawn(self.clone().advertise_to_consul());
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
self.background background::spawn(self.clone().advertise_to_kubernetes());
.spawn(self.clone().advertise_to_kubernetes());
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL); let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
select! { select! {

View File

@ -69,10 +69,8 @@ where
gc gc
} }
pub(crate) fn spawn_workers(self: &Arc<Self>) { pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
self.system bg.spawn_worker(GcWorker::new(self.clone()));
.background
.spawn_worker(GcWorker::new(self.clone()));
} }
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> { async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {

View File

@ -87,12 +87,12 @@ where
syncer syncer
} }
pub(crate) fn spawn_workers(self: &Arc<Self>) { pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel(); let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
self.add_full_sync_tx self.add_full_sync_tx
.store(Some(Arc::new(add_full_sync_tx))); .store(Some(Arc::new(add_full_sync_tx)));
self.system.background.spawn_worker(SyncWorker { bg.spawn_worker(SyncWorker {
syncer: self.clone(), syncer: self.clone(),
ring_recv: self.system.ring.clone(), ring_recv: self.system.ring.clone(),
ring: self.system.ring.borrow().clone(), ring: self.system.ring.borrow().clone(),

View File

@ -14,6 +14,7 @@ use opentelemetry::{
use garage_db as db; use garage_db as db;
use garage_util::background::{self, BackgroundRunner};
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::metrics::RecordDuration; use garage_util::metrics::RecordDuration;
@ -96,13 +97,11 @@ where
table table
} }
pub fn spawn_workers(self: &Arc<Self>) { pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
self.merkle_updater.spawn_workers(&self.system.background); self.merkle_updater.spawn_workers(bg);
self.syncer.spawn_workers(); self.syncer.spawn_workers(bg);
self.gc.spawn_workers(); self.gc.spawn_workers(bg);
self.system bg.spawn_worker(InsertQueueWorker(self.clone()));
.background
.spawn_worker(InsertQueueWorker(self.clone()));
} }
pub async fn insert(&self, e: &F::E) -> Result<(), Error> { pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
@ -276,9 +275,7 @@ where
if not_all_same { if not_all_same {
let self2 = self.clone(); let self2 = self.clone();
let ent2 = ret_entry.clone(); let ent2 = ret_entry.clone();
self.system background::spawn(async move { self2.repair_on_read(&who[..], ent2).await });
.background
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
} }
} }
@ -375,7 +372,7 @@ where
.into_iter() .into_iter()
.map(|k| ret.get(&k).unwrap().clone()) .map(|k| ret.get(&k).unwrap().clone())
.collect::<Vec<_>>(); .collect::<Vec<_>>();
self.system.background.spawn_cancellable(async move { background::spawn(async move {
for v in to_repair { for v in to_repair {
self2.repair_on_read(&who[..], v).await?; self2.repair_on_read(&who[..], v).await?;
} }

View File

@ -1,48 +0,0 @@
//! Job worker: a generic worker that just processes incoming
//! jobs one by one
use std::sync::Arc;
use async_trait::async_trait;
use tokio::sync::{mpsc, Mutex};
use crate::background::worker::*;
use crate::background::*;
/// Generic worker that pulls boxed jobs off a shared channel and runs
/// them one at a time.
pub(crate) struct JobWorker {
    /// Number of this worker; only used to build the name reported by `name()`.
    pub(crate) index: usize,
    /// Receiving end of the job queue, behind an async mutex so that
    /// several workers can take turns receiving from it. Each item is a
    /// job paired with a flag telling whether the job is cancellable.
    pub(crate) job_chan: Arc<Mutex<mpsc::UnboundedReceiver<(Job, bool)>>>,
    /// Job received by `wait_for_work` and not yet executed by `work`.
    pub(crate) next_job: Option<Job>,
}
#[async_trait]
impl Worker for JobWorker {
    /// Display name of this worker, built from its index.
    fn name(&self) -> String {
        format!("Job worker #{}", self.index)
    }

    /// Executes the job previously stored in `next_job`, if there is one.
    ///
    /// Returns `Busy` after a job ran to completion, `Idle` when no job
    /// was pending, and propagates the job's error if it failed.
    async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
        if let Some(pending) = self.next_job.take() {
            pending.await?;
            Ok(WorkerState::Busy)
        } else {
            Ok(WorkerState::Idle)
        }
    }

    /// Waits on the shared channel until a job that should be run arrives.
    ///
    /// Cancellable jobs received while `must_exit` is set are dropped
    /// without being run. Returns `Busy` once a job has been stored in
    /// `next_job`, or `Done` when the channel is closed.
    async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
        // The channel lock is taken anew for every receive attempt.
        while let Some((job, cancellable)) = self.job_chan.lock().await.recv().await {
            let discard = cancellable && *must_exit.borrow();
            if !discard {
                self.next_job = Some(job);
                return WorkerState::Busy;
            }
        }
        WorkerState::Done
    }
}

View File

@ -1,27 +1,23 @@
//! Job runner for futures and async functions //! Job runner for futures and async functions
pub mod job_worker;
pub mod worker; pub mod worker;
use core::future::Future; use core::future::Future;
use std::collections::HashMap; use std::collections::HashMap;
use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::sync::{mpsc, watch, Mutex}; use tokio::sync::{mpsc, watch};
use crate::error::Error; use crate::error::Error;
use worker::WorkerProcessor; use worker::WorkerProcessor;
pub use worker::{Worker, WorkerState}; pub use worker::{Worker, WorkerState};
pub(crate) type JobOutput = Result<(), Error>; pub(crate) type JobOutput = Result<(), Error>;
pub(crate) type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
/// Job runner for futures and async functions /// Job runner for futures and async functions
pub struct BackgroundRunner { pub struct BackgroundRunner {
send_job: mpsc::UnboundedSender<(Job, bool)>,
send_worker: mpsc::UnboundedSender<Box<dyn Worker>>, send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>, worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
} }
@ -49,10 +45,7 @@ pub struct WorkerStatus {
impl BackgroundRunner { impl BackgroundRunner {
/// Create a new BackgroundRunner /// Create a new BackgroundRunner
pub fn new( pub fn new(stop_signal: watch::Receiver<bool>) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
n_runners: usize,
stop_signal: watch::Receiver<bool>,
) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>(); let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new())); let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
@ -63,24 +56,7 @@ impl BackgroundRunner {
worker_processor.run().await; worker_processor.run().await;
}); });
let (send_job, queue_out) = mpsc::unbounded_channel();
let queue_out = Arc::new(Mutex::new(queue_out));
for i in 0..n_runners {
let queue_out = queue_out.clone();
send_worker
.send(Box::new(job_worker::JobWorker {
index: i,
job_chan: queue_out.clone(),
next_job: None,
}))
.ok()
.unwrap();
}
let bgrunner = Arc::new(Self { let bgrunner = Arc::new(Self {
send_job,
send_worker, send_worker,
worker_info, worker_info,
}); });
@ -91,31 +67,6 @@ impl BackgroundRunner {
self.worker_info.lock().unwrap().clone() self.worker_info.lock().unwrap().clone()
} }
/// Spawn a task to be run in background
pub fn spawn<T>(&self, job: T)
where
T: Future<Output = JobOutput> + Send + 'static,
{
let boxed: Job = Box::pin(job);
self.send_job
.send((boxed, false))
.ok()
.expect("Could not put job in queue");
}
/// Spawn a task to be run in background. It may get discarded before running if spawned while
/// the runner is stopping
pub fn spawn_cancellable<T>(&self, job: T)
where
T: Future<Output = JobOutput> + Send + 'static,
{
let boxed: Job = Box::pin(job);
self.send_job
.send((boxed, true))
.ok()
.expect("Could not put job in queue");
}
pub fn spawn_worker<W>(&self, worker: W) pub fn spawn_worker<W>(&self, worker: W)
where where
W: Worker + 'static, W: Worker + 'static,
@ -126,3 +77,14 @@ impl BackgroundRunner {
.expect("Could not put worker in queue"); .expect("Could not put worker in queue");
} }
} }
pub fn spawn<T>(job: T)
where
T: Future<Output = JobOutput> + Send + 'static,
{
tokio::spawn(async move {
if let Err(e) = job.await {
error!("{}", e);
}
});
}