Merge pull request 'Some improvements to Garage internals' (#451) from internals-rework into main

Reviewed-on: Deuxfleurs/garage#451
This commit is contained in:
Alex 2023-01-03 11:37:31 +00:00
commit 582b076179
27 changed files with 442 additions and 559 deletions

1
Cargo.lock generated
View file

@ -1243,6 +1243,7 @@ dependencies = [
name = "garage_table" name = "garage_table"
version = "0.8.1" version = "0.8.1"
dependencies = [ dependencies = [
"arc-swap",
"async-trait", "async-trait",
"bytes", "bytes",
"futures", "futures",

View file

@ -32,7 +32,7 @@ args@{
ignoreLockHash, ignoreLockHash,
}: }:
let let
nixifiedLockHash = "463114c4544bfa9b442a43afc6b39eb588f5720825c7a246ba9188c4bdb52944"; nixifiedLockHash = "4639f63ff4c54c01f66ec3d0d362f6905456dd768d6e94df1a7367c763721fd7";
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc; workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
lockHashIgnored = if ignoreLockHash lockHashIgnored = if ignoreLockHash
@ -1769,6 +1769,7 @@ in
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/table"); src = fetchCrateLocal (workspaceSrc + "/src/table");
dependencies = { dependencies = {
arc_swap = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".arc-swap."1.5.0" { inherit profileName; }).out;
async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.52" { profileName = "__noProfile"; }).out; async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.52" { profileName = "__noProfile"; }).out;
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.2.0" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.2.0" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.21" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.21" { inherit profileName; }).out;

View file

@ -3,6 +3,7 @@ use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use arc_swap::ArcSwapOption;
use async_trait::async_trait; use async_trait::async_trait;
use bytes::Bytes; use bytes::Bytes;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -22,6 +23,7 @@ use garage_rpc::rpc_helper::netapp::stream::{stream_asyncread, ByteStream};
use garage_db as db; use garage_db as db;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::metrics::RecordDuration; use garage_util::metrics::RecordDuration;
@ -87,7 +89,7 @@ pub struct BlockManager {
pub(crate) metrics: BlockManagerMetrics, pub(crate) metrics: BlockManagerMetrics,
tx_scrub_command: mpsc::Sender<ScrubWorkerCommand>, tx_scrub_command: ArcSwapOption<mpsc::Sender<ScrubWorkerCommand>>,
} }
#[derive(Serialize, Deserialize, Clone, Debug)] #[derive(Serialize, Deserialize, Clone, Debug)]
@ -126,8 +128,6 @@ impl BlockManager {
let metrics = let metrics =
BlockManagerMetrics::new(rc.rc.clone(), resync.queue.clone(), resync.errors.clone()); BlockManagerMetrics::new(rc.rc.clone(), resync.queue.clone(), resync.errors.clone());
let (scrub_tx, scrub_rx) = mpsc::channel(1);
let block_manager = Arc::new(Self { let block_manager = Arc::new(Self {
replication, replication,
data_dir, data_dir,
@ -138,21 +138,24 @@ impl BlockManager {
system, system,
endpoint, endpoint,
metrics, metrics,
tx_scrub_command: scrub_tx, tx_scrub_command: ArcSwapOption::new(None),
}); });
block_manager.endpoint.set_handler(block_manager.clone()); block_manager.endpoint.set_handler(block_manager.clone());
block_manager
}
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
// Spawn a bunch of resync workers // Spawn a bunch of resync workers
for index in 0..MAX_RESYNC_WORKERS { for index in 0..MAX_RESYNC_WORKERS {
let worker = ResyncWorker::new(index, block_manager.clone()); let worker = ResyncWorker::new(index, self.clone());
block_manager.system.background.spawn_worker(worker); bg.spawn_worker(worker);
} }
// Spawn scrub worker // Spawn scrub worker
let scrub_worker = ScrubWorker::new(block_manager.clone(), scrub_rx); let (scrub_tx, scrub_rx) = mpsc::channel(1);
block_manager.system.background.spawn_worker(scrub_worker); self.tx_scrub_command.store(Some(Arc::new(scrub_tx)));
bg.spawn_worker(ScrubWorker::new(self.clone(), scrub_rx));
block_manager
} }
/// Ask nodes that might have a (possibly compressed) block for it /// Ask nodes that might have a (possibly compressed) block for it
@ -325,8 +328,11 @@ impl BlockManager {
} }
/// Send command to start/stop/manage scrub worker /// Send command to start/stop/manage scrub worker
pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) { pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) -> Result<(), Error> {
let _ = self.tx_scrub_command.send(cmd).await; let tx = self.tx_scrub_command.load();
let tx = tx.as_ref().ok_or_message("scrub worker is not running")?;
tx.send(cmd).await.ok_or_message("send error")?;
Ok(())
} }
/// Get the reference count of a block /// Get the reference count of a block

View file

@ -148,7 +148,7 @@ impl Worker for RepairWorker {
} }
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
unreachable!() unreachable!()
} }
} }
@ -341,7 +341,7 @@ impl Worker for ScrubWorker {
} }
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
let (wait_until, command) = match &self.work { let (wait_until, command) = match &self.work {
ScrubWorkerState::Running(_) => return WorkerState::Busy, ScrubWorkerState::Running(_) => return WorkerState::Busy,
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume), ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),

View file

@ -540,7 +540,7 @@ impl Worker for ResyncWorker {
} }
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
while self.index >= self.manager.resync.persisted.load().n_workers { while self.index >= self.manager.resync.persisted.load().n_workers {
self.manager.resync.notify.notified().await self.manager.resync.notify.notified().await
} }

View file

@ -5,6 +5,7 @@ use std::sync::Arc;
use async_trait::async_trait; use async_trait::async_trait;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use garage_util::background::BackgroundRunner;
use garage_util::crdt::*; use garage_util::crdt::*;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error as GarageError; use garage_util::error::Error as GarageError;
@ -74,13 +75,18 @@ impl Rpc for AdminRpc {
pub struct AdminRpcHandler { pub struct AdminRpcHandler {
garage: Arc<Garage>, garage: Arc<Garage>,
background: Arc<BackgroundRunner>,
endpoint: Arc<Endpoint<AdminRpc, Self>>, endpoint: Arc<Endpoint<AdminRpc, Self>>,
} }
impl AdminRpcHandler { impl AdminRpcHandler {
pub fn new(garage: Arc<Garage>) -> Arc<Self> { pub fn new(garage: Arc<Garage>, background: Arc<BackgroundRunner>) -> Arc<Self> {
let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into()); let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into());
let admin = Arc::new(Self { garage, endpoint }); let admin = Arc::new(Self {
garage,
background,
endpoint,
});
admin.endpoint.set_handler(admin.clone()); admin.endpoint.set_handler(admin.clone());
admin admin
} }
@ -759,7 +765,7 @@ impl AdminRpcHandler {
))) )))
} }
} else { } else {
launch_online_repair(self.garage.clone(), opt).await; launch_online_repair(&self.garage, &self.background, opt).await?;
Ok(AdminRpc::Ok(format!( Ok(AdminRpc::Ok(format!(
"Repair launched on {:?}", "Repair launched on {:?}",
self.garage.system.id self.garage.system.id
@ -925,12 +931,11 @@ impl AdminRpcHandler {
async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> { async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> {
match cmd { match cmd {
WorkerOperation::List { opt } => { WorkerOperation::List { opt } => {
let workers = self.garage.background.get_worker_info(); let workers = self.background.get_worker_info();
Ok(AdminRpc::WorkerList(workers, *opt)) Ok(AdminRpc::WorkerList(workers, *opt))
} }
WorkerOperation::Info { tid } => { WorkerOperation::Info { tid } => {
let info = self let info = self
.garage
.background .background
.get_worker_info() .get_worker_info()
.get(tid) .get(tid)
@ -944,7 +949,7 @@ impl AdminRpcHandler {
self.garage self.garage
.block_manager .block_manager
.send_scrub_command(scrub_command) .send_scrub_command(scrub_command)
.await; .await?;
Ok(AdminRpc::Ok("Scrub tranquility updated".into())) Ok(AdminRpc::Ok("Scrub tranquility updated".into()))
} }
WorkerSetCmd::ResyncWorkerCount { worker_count } => { WorkerSetCmd::ResyncWorkerCount { worker_count } => {

View file

@ -1,8 +1,5 @@
use std::path::PathBuf; use std::path::PathBuf;
use tokio::sync::watch;
use garage_util::background::*;
use garage_util::config::*; use garage_util::config::*;
use garage_util::error::*; use garage_util::error::*;
@ -20,12 +17,8 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
info!("Loading configuration..."); info!("Loading configuration...");
let config = read_config(config_file)?; let config = read_config(config_file)?;
info!("Initializing background runner...");
let (done_tx, done_rx) = watch::channel(false);
let (background, await_background_done) = BackgroundRunner::new(16, done_rx);
info!("Initializing Garage main data store..."); info!("Initializing Garage main data store...");
let garage = Garage::new(config.clone(), background)?; let garage = Garage::new(config)?;
info!("Launching repair operation..."); info!("Launching repair operation...");
match opt.what { match opt.what {
@ -43,13 +36,7 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
} }
} }
info!("Repair operation finished, shutting down Garage internals..."); info!("Repair operation finished, shutting down...");
done_tx.send(true).unwrap();
drop(garage);
await_background_done.await?;
info!("Cleaning up...");
Ok(()) Ok(())
} }

View file

@ -15,35 +15,33 @@ use garage_util::error::Error;
use crate::*; use crate::*;
pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) { pub async fn launch_online_repair(
garage: &Arc<Garage>,
bg: &BackgroundRunner,
opt: RepairOpt,
) -> Result<(), Error> {
match opt.what { match opt.what {
RepairWhat::Tables => { RepairWhat::Tables => {
info!("Launching a full sync of tables"); info!("Launching a full sync of tables");
garage.bucket_table.syncer.add_full_sync(); garage.bucket_table.syncer.add_full_sync()?;
garage.object_table.syncer.add_full_sync(); garage.object_table.syncer.add_full_sync()?;
garage.version_table.syncer.add_full_sync(); garage.version_table.syncer.add_full_sync()?;
garage.block_ref_table.syncer.add_full_sync(); garage.block_ref_table.syncer.add_full_sync()?;
garage.key_table.syncer.add_full_sync(); garage.key_table.syncer.add_full_sync()?;
} }
RepairWhat::Versions => { RepairWhat::Versions => {
info!("Repairing the versions table"); info!("Repairing the versions table");
garage bg.spawn_worker(RepairVersionsWorker::new(garage.clone()));
.background
.spawn_worker(RepairVersionsWorker::new(garage.clone()));
} }
RepairWhat::BlockRefs => { RepairWhat::BlockRefs => {
info!("Repairing the block refs table"); info!("Repairing the block refs table");
garage bg.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
.background
.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
} }
RepairWhat::Blocks => { RepairWhat::Blocks => {
info!("Repairing the stored blocks"); info!("Repairing the stored blocks");
garage bg.spawn_worker(garage_block::repair::RepairWorker::new(
.background garage.block_manager.clone(),
.spawn_worker(garage_block::repair::RepairWorker::new( ));
garage.block_manager.clone(),
));
} }
RepairWhat::Scrub { cmd } => { RepairWhat::Scrub { cmd } => {
let cmd = match cmd { let cmd = match cmd {
@ -56,9 +54,10 @@ pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) {
} }
}; };
info!("Sending command to scrub worker: {:?}", cmd); info!("Sending command to scrub worker: {:?}", cmd);
garage.block_manager.send_scrub_command(cmd).await; garage.block_manager.send_scrub_command(cmd).await?;
} }
} }
Ok(())
} }
// ---- // ----
@ -93,19 +92,14 @@ impl Worker for RepairVersionsWorker {
} }
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
let item_bytes = match self.garage.version_table.data.store.get_gt(&self.pos)? { let (item_bytes, next_pos) = match self.garage.version_table.data.store.get_gt(&self.pos)? {
Some((k, v)) => { Some((k, v)) => (v, k),
self.pos = k;
v
}
None => { None => {
info!("repair_versions: finished, done {}", self.counter); info!("repair_versions: finished, done {}", self.counter);
return Ok(WorkerState::Done); return Ok(WorkerState::Done);
} }
}; };
self.counter += 1;
let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?; let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?;
if !version.deleted.get() { if !version.deleted.get() {
let object = self let object = self
@ -134,10 +128,13 @@ impl Worker for RepairVersionsWorker {
} }
} }
self.counter += 1;
self.pos = next_pos;
Ok(WorkerState::Busy) Ok(WorkerState::Busy)
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
unreachable!() unreachable!()
} }
} }
@ -174,18 +171,14 @@ impl Worker for RepairBlockrefsWorker {
} }
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
let item_bytes = match self.garage.block_ref_table.data.store.get_gt(&self.pos)? { let (item_bytes, next_pos) =
Some((k, v)) => { match self.garage.block_ref_table.data.store.get_gt(&self.pos)? {
self.pos = k; Some((k, v)) => (v, k),
v None => {
} info!("repair_block_ref: finished, done {}", self.counter);
None => { return Ok(WorkerState::Done);
info!("repair_block_ref: finished, done {}", self.counter); }
return Ok(WorkerState::Done); };
}
};
self.counter += 1;
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?; let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?;
if !block_ref.deleted.get() { if !block_ref.deleted.get() {
@ -212,10 +205,13 @@ impl Worker for RepairBlockrefsWorker {
} }
} }
self.counter += 1;
self.pos = next_pos;
Ok(WorkerState::Busy) Ok(WorkerState::Busy)
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
unreachable!() unreachable!()
} }
} }

View file

@ -35,12 +35,15 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
#[cfg(feature = "metrics")] #[cfg(feature = "metrics")]
let metrics_exporter = opentelemetry_prometheus::exporter().init(); let metrics_exporter = opentelemetry_prometheus::exporter().init();
info!("Initializing Garage main data store...");
let garage = Garage::new(config.clone())?;
info!("Initializing background runner..."); info!("Initializing background runner...");
let watch_cancel = watch_shutdown_signal(); let watch_cancel = watch_shutdown_signal();
let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone());
info!("Initializing Garage main data store..."); info!("Spawning Garage workers...");
let garage = Garage::new(config.clone(), background)?; garage.spawn_workers(&background);
if config.admin.trace_sink.is_some() { if config.admin.trace_sink.is_some() {
info!("Initialize tracing..."); info!("Initialize tracing...");
@ -63,7 +66,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone())); let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
info!("Create admin RPC handler..."); info!("Create admin RPC handler...");
AdminRpcHandler::new(garage.clone()); AdminRpcHandler::new(garage.clone(), background.clone());
// ---- Launch public-facing API servers ---- // ---- Launch public-facing API servers ----

View file

@ -39,8 +39,6 @@ pub struct Garage {
/// The local database /// The local database
pub db: db::Db, pub db: db::Db,
/// A background job runner
pub background: Arc<BackgroundRunner>,
/// The membership manager /// The membership manager
pub system: Arc<System>, pub system: Arc<System>,
/// The block manager /// The block manager
@ -78,7 +76,7 @@ pub struct GarageK2V {
impl Garage { impl Garage {
/// Create and run garage /// Create and run garage
pub fn new(config: Config, background: Arc<BackgroundRunner>) -> Result<Arc<Self>, Error> { pub fn new(config: Config) -> Result<Arc<Self>, Error> {
// Create meta dir and data dir if they don't exist already // Create meta dir and data dir if they don't exist already
std::fs::create_dir_all(&config.metadata_dir) std::fs::create_dir_all(&config.metadata_dir)
.ok_or_message("Unable to create Garage metadata directory")?; .ok_or_message("Unable to create Garage metadata directory")?;
@ -167,7 +165,7 @@ impl Garage {
.expect("Invalid replication_mode in config file."); .expect("Invalid replication_mode in config file.");
info!("Initialize membership management system..."); info!("Initialize membership management system...");
let system = System::new(network_key, background.clone(), replication_mode, &config)?; let system = System::new(network_key, replication_mode, &config)?;
let data_rep_param = TableShardedReplication { let data_rep_param = TableShardedReplication {
system: system.clone(), system: system.clone(),
@ -225,7 +223,6 @@ impl Garage {
info!("Initialize version_table..."); info!("Initialize version_table...");
let version_table = Table::new( let version_table = Table::new(
VersionTable { VersionTable {
background: background.clone(),
block_ref_table: block_ref_table.clone(), block_ref_table: block_ref_table.clone(),
}, },
meta_rep_param.clone(), meta_rep_param.clone(),
@ -240,7 +237,6 @@ impl Garage {
#[allow(clippy::redundant_clone)] #[allow(clippy::redundant_clone)]
let object_table = Table::new( let object_table = Table::new(
ObjectTable { ObjectTable {
background: background.clone(),
version_table: version_table.clone(), version_table: version_table.clone(),
object_counter_table: object_counter_table.clone(), object_counter_table: object_counter_table.clone(),
}, },
@ -258,7 +254,6 @@ impl Garage {
config, config,
replication_mode, replication_mode,
db, db,
background,
system, system,
block_manager, block_manager,
bucket_table, bucket_table,
@ -273,6 +268,22 @@ impl Garage {
})) }))
} }
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
self.block_manager.spawn_workers(bg);
self.bucket_table.spawn_workers(bg);
self.bucket_alias_table.spawn_workers(bg);
self.key_table.spawn_workers(bg);
self.object_table.spawn_workers(bg);
self.object_counter_table.spawn_workers(bg);
self.version_table.spawn_workers(bg);
self.block_ref_table.spawn_workers(bg);
#[cfg(feature = "k2v")]
self.k2v.spawn_workers(bg);
}
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper { pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
helper::bucket::BucketHelper(self) helper::bucket::BucketHelper(self)
} }
@ -307,4 +318,9 @@ impl GarageK2V {
rpc, rpc,
} }
} }
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
self.item_table.spawn_workers(bg);
self.counter_table.spawn_workers(bg);
}
} }

View file

@ -1,17 +1,15 @@
use core::ops::Bound; use core::ops::Bound;
use std::collections::{hash_map, BTreeMap, HashMap}; use std::collections::{BTreeMap, HashMap};
use std::marker::PhantomData; use std::marker::PhantomData;
use std::sync::Arc; use std::sync::Arc;
use async_trait::async_trait;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::sync::{mpsc, watch};
use garage_db as db; use garage_db as db;
use garage_rpc::ring::Ring; use garage_rpc::ring::Ring;
use garage_rpc::system::System; use garage_rpc::system::System;
use garage_util::background::*; use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::time::*; use garage_util::time::*;
@ -142,7 +140,6 @@ impl<T: CountedItem> TableSchema for CounterTable<T> {
pub struct IndexCounter<T: CountedItem> { pub struct IndexCounter<T: CountedItem> {
this_node: Uuid, this_node: Uuid,
local_counter: db::Tree, local_counter: db::Tree,
propagate_tx: mpsc::UnboundedSender<(T::CP, T::CS, LocalCounterEntry<T>)>,
pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>, pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>,
} }
@ -152,16 +149,11 @@ impl<T: CountedItem> IndexCounter<T> {
replication: TableShardedReplication, replication: TableShardedReplication,
db: &db::Db, db: &db::Db,
) -> Arc<Self> { ) -> Arc<Self> {
let background = system.background.clone(); Arc::new(Self {
let (propagate_tx, propagate_rx) = mpsc::unbounded_channel();
let this = Arc::new(Self {
this_node: system.id, this_node: system.id,
local_counter: db local_counter: db
.open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME)) .open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME))
.expect("Unable to open local counter tree"), .expect("Unable to open local counter tree"),
propagate_tx,
table: Table::new( table: Table::new(
CounterTable { CounterTable {
_phantom_t: Default::default(), _phantom_t: Default::default(),
@ -170,16 +162,11 @@ impl<T: CountedItem> IndexCounter<T> {
system, system,
db, db,
), ),
}); })
}
background.spawn_worker(IndexPropagatorWorker { pub fn spawn_workers(&self, bg: &BackgroundRunner) {
index_counter: this.clone(), self.table.spawn_workers(bg);
propagate_rx,
buf: HashMap::new(),
errors: 0,
});
this
} }
pub fn count( pub fn count(
@ -232,12 +219,8 @@ impl<T: CountedItem> IndexCounter<T> {
.map_err(db::TxError::Abort)?; .map_err(db::TxError::Abort)?;
tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?; tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?;
if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) { let dist_entry = entry.into_counter_entry(self.this_node);
error!( self.table.queue_insert(tx, &dist_entry)?;
"Could not propagate updated counter values, failed to send to channel: {}",
e
);
}
Ok(()) Ok(())
} }
@ -250,23 +233,6 @@ impl<T: CountedItem> IndexCounter<T> {
TS: TableSchema<E = T>, TS: TableSchema<E = T>,
TR: TableReplication, TR: TableReplication,
{ {
let save_counter_entry = |entry: CounterEntry<T>| -> Result<(), Error> {
let entry_k = self
.table
.data
.tree_key(entry.partition_key(), entry.sort_key());
self.table
.data
.update_entry_with(&entry_k, |ent| match ent {
Some(mut ent) => {
ent.merge(&entry);
ent
}
None => entry.clone(),
})?;
Ok(())
};
// 1. Set all old local counters to zero // 1. Set all old local counters to zero
let now = now_msec(); let now = now_msec();
let mut next_start: Option<Vec<u8>> = None; let mut next_start: Option<Vec<u8>> = None;
@ -302,7 +268,9 @@ impl<T: CountedItem> IndexCounter<T> {
.insert(&local_counter_k, &local_counter_bytes)?; .insert(&local_counter_k, &local_counter_bytes)?;
let counter_entry = local_counter.into_counter_entry(self.this_node); let counter_entry = local_counter.into_counter_entry(self.this_node);
save_counter_entry(counter_entry)?; self.local_counter
.db()
.transaction(|mut tx| self.table.queue_insert(&mut tx, &counter_entry))?;
next_start = Some(local_counter_k); next_start = Some(local_counter_k);
} }
@ -367,7 +335,9 @@ impl<T: CountedItem> IndexCounter<T> {
.insert(&local_counter_key, local_counter_bytes)?; .insert(&local_counter_key, local_counter_bytes)?;
let counter_entry = local_counter.into_counter_entry(self.this_node); let counter_entry = local_counter.into_counter_entry(self.this_node);
save_counter_entry(counter_entry)?; self.local_counter
.db()
.transaction(|mut tx| self.table.queue_insert(&mut tx, &counter_entry))?;
next_start = Some(counted_entry_k); next_start = Some(counted_entry_k);
} }
@ -378,96 +348,7 @@ impl<T: CountedItem> IndexCounter<T> {
} }
} }
struct IndexPropagatorWorker<T: CountedItem> { // ----
index_counter: Arc<IndexCounter<T>>,
propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry<T>)>,
buf: HashMap<Vec<u8>, CounterEntry<T>>,
errors: usize,
}
impl<T: CountedItem> IndexPropagatorWorker<T> {
fn add_ent(&mut self, pk: T::CP, sk: T::CS, counters: LocalCounterEntry<T>) {
let tree_key = self.index_counter.table.data.tree_key(&pk, &sk);
let dist_entry = counters.into_counter_entry(self.index_counter.this_node);
match self.buf.entry(tree_key) {
hash_map::Entry::Vacant(e) => {
e.insert(dist_entry);
}
hash_map::Entry::Occupied(mut e) => {
e.get_mut().merge(&dist_entry);
}
}
}
}
#[async_trait]
impl<T: CountedItem> Worker for IndexPropagatorWorker<T> {
fn name(&self) -> String {
format!("{} counter", T::COUNTER_TABLE_NAME)
}
fn status(&self) -> WorkerStatus {
WorkerStatus {
queue_length: Some(self.buf.len() as u64),
..Default::default()
}
}
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
// This loop batches updates to counters to be sent all at once.
// They are sent once the propagate_rx channel has been emptied (or is closed).
let closed = loop {
match self.propagate_rx.try_recv() {
Ok((pk, sk, counters)) => {
self.add_ent(pk, sk, counters);
}
Err(mpsc::error::TryRecvError::Empty) => break false,
Err(mpsc::error::TryRecvError::Disconnected) => break true,
}
};
if !self.buf.is_empty() {
let entries_k = self.buf.keys().take(100).cloned().collect::<Vec<_>>();
let entries = entries_k.iter().map(|k| self.buf.get(k).unwrap());
if let Err(e) = self.index_counter.table.insert_many(entries).await {
self.errors += 1;
if self.errors >= 2 && *must_exit.borrow() {
error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, self.buf.len(), e);
return Ok(WorkerState::Done);
}
// Propagate error up to worker manager, it will log it, increment a counter,
// and sleep for a certain delay (with exponential backoff), waiting for
// things to go back to normal
return Err(e);
} else {
for k in entries_k {
self.buf.remove(&k);
}
self.errors = 0;
}
return Ok(WorkerState::Busy);
} else if closed {
return Ok(WorkerState::Done);
} else {
return Ok(WorkerState::Idle);
}
}
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
match self.propagate_rx.recv().await {
Some((pk, sk, counters)) => {
self.add_ent(pk, sk, counters);
WorkerState::Busy
}
None => match self.buf.is_empty() {
false => WorkerState::Busy,
true => WorkerState::Done,
},
}
}
}
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
struct LocalCounterEntry<T: CountedItem> { struct LocalCounterEntry<T: CountedItem> {

View file

@ -273,14 +273,9 @@ impl K2VRpcHandler {
} }
fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> { fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> {
let tree_key = self
.item_table
.data
.tree_key(&item.partition, &item.sort_key);
self.item_table self.item_table
.data .data
.update_entry_with(&tree_key[..], |ent| { .update_entry_with(&item.partition, &item.sort_key, |ent| {
let mut ent = ent.unwrap_or_else(|| { let mut ent = ent.unwrap_or_else(|| {
K2VItem::new( K2VItem::new(
item.partition.bucket_id, item.partition.bucket_id,

View file

@ -4,7 +4,6 @@ use std::sync::Arc;
use garage_db as db; use garage_db as db;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::crdt::*; use garage_table::crdt::*;
@ -221,7 +220,6 @@ impl Crdt for Object {
} }
pub struct ObjectTable { pub struct ObjectTable {
pub background: Arc<BackgroundRunner>,
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>, pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
pub object_counter_table: Arc<IndexCounter<Object>>, pub object_counter_table: Arc<IndexCounter<Object>>,
} }
@ -255,34 +253,34 @@ impl TableSchema for ObjectTable {
); );
} }
// 2. Spawn threads that propagate deletions to version table // 2. Enqueue propagation of deletions to version table
let version_table = self.version_table.clone(); if let (Some(old_v), Some(new_v)) = (old, new) {
let old = old.cloned(); // Propagate deletion of old versions
let new = new.cloned(); for v in old_v.versions.iter() {
let newly_deleted = match new_v
self.background.spawn(async move { .versions
if let (Some(old_v), Some(new_v)) = (old, new) { .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
// Propagate deletion of old versions {
for v in old_v.versions.iter() { Err(_) => true,
let newly_deleted = match new_v Ok(i) => {
.versions new_v.versions[i].state == ObjectVersionState::Aborted
.binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key())) && v.state != ObjectVersionState::Aborted
{ }
Err(_) => true, };
Ok(i) => { if newly_deleted {
new_v.versions[i].state == ObjectVersionState::Aborted let deleted_version =
&& v.state != ObjectVersionState::Aborted Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
} let res = self.version_table.queue_insert(tx, &deleted_version);
}; if let Err(e) = db::unabort(res)? {
if newly_deleted { error!(
let deleted_version = "Unable to enqueue version deletion propagation: {}. A repair will be needed.",
Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true); e
version_table.insert(&deleted_version).await?; );
} }
} }
} }
Ok(()) }
});
Ok(()) Ok(())
} }

View file

@ -3,7 +3,6 @@ use std::sync::Arc;
use garage_db as db; use garage_db as db;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::crdt::*; use garage_table::crdt::*;
@ -127,7 +126,6 @@ impl Crdt for Version {
} }
pub struct VersionTable { pub struct VersionTable {
pub background: Arc<BackgroundRunner>,
pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>, pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
} }
@ -141,33 +139,26 @@ impl TableSchema for VersionTable {
fn updated( fn updated(
&self, &self,
_tx: &mut db::Transaction, tx: &mut db::Transaction,
old: Option<&Self::E>, old: Option<&Self::E>,
new: Option<&Self::E>, new: Option<&Self::E>,
) -> db::TxOpResult<()> { ) -> db::TxOpResult<()> {
let block_ref_table = self.block_ref_table.clone(); if let (Some(old_v), Some(new_v)) = (old, new) {
let old = old.cloned(); // Propagate deletion of version blocks
let new = new.cloned(); if new_v.deleted.get() && !old_v.deleted.get() {
let deleted_block_refs = old_v.blocks.items().iter().map(|(_k, vb)| BlockRef {
self.background.spawn(async move { block: vb.hash,
if let (Some(old_v), Some(new_v)) = (old, new) { version: old_v.uuid,
// Propagate deletion of version blocks deleted: true.into(),
if new_v.deleted.get() && !old_v.deleted.get() { });
let deleted_block_refs = old_v for block_ref in deleted_block_refs {
.blocks let res = self.block_ref_table.queue_insert(tx, &block_ref);
.items() if let Err(e) = db::unabort(res)? {
.iter() error!("Unable to enqueue block ref deletion propagation: {}. A repair will be needed.", e);
.map(|(_k, vb)| BlockRef { }
block: vb.hash,
version: old_v.uuid,
deleted: true.into(),
})
.collect::<Vec<_>>();
block_ref_table.insert_many(&deleted_block_refs[..]).await?;
} }
} }
Ok(()) }
});
Ok(()) Ok(())
} }

View file

@ -5,7 +5,6 @@ use std::time::Duration;
use futures::future::join_all; use futures::future::join_all;
use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt; use futures::stream::StreamExt;
use futures_util::future::FutureExt;
use tokio::select; use tokio::select;
use tokio::sync::watch; use tokio::sync::watch;
@ -24,7 +23,6 @@ pub use netapp::message::{
use netapp::peering::fullmesh::FullMeshPeeringStrategy; use netapp::peering::fullmesh::FullMeshPeeringStrategy;
pub use netapp::{self, NetApp, NodeID}; pub use netapp::{self, NetApp, NodeID};
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::metrics::RecordDuration; use garage_util::metrics::RecordDuration;
@ -94,7 +92,6 @@ pub struct RpcHelper(Arc<RpcHelperInner>);
struct RpcHelperInner { struct RpcHelperInner {
our_node_id: Uuid, our_node_id: Uuid,
fullmesh: Arc<FullMeshPeeringStrategy>, fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>, ring: watch::Receiver<Arc<Ring>>,
metrics: RpcMetrics, metrics: RpcMetrics,
rpc_timeout: Duration, rpc_timeout: Duration,
@ -104,7 +101,6 @@ impl RpcHelper {
pub(crate) fn new( pub(crate) fn new(
our_node_id: Uuid, our_node_id: Uuid,
fullmesh: Arc<FullMeshPeeringStrategy>, fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>, ring: watch::Receiver<Arc<Ring>>,
rpc_timeout: Option<Duration>, rpc_timeout: Option<Duration>,
) -> Self { ) -> Self {
@ -113,7 +109,6 @@ impl RpcHelper {
Self(Arc::new(RpcHelperInner { Self(Arc::new(RpcHelperInner {
our_node_id, our_node_id,
fullmesh, fullmesh,
background,
ring, ring,
metrics, metrics,
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
@ -377,16 +372,13 @@ impl RpcHelper {
if !resp_stream.is_empty() { if !resp_stream.is_empty() {
// Continue remaining requests in background. // Continue remaining requests in background.
// Continue the remaining requests immediately using tokio::spawn // Note: these requests can get interrupted on process shutdown,
// but enqueue a task in the background runner // we must not count on them being executed for certain.
// to ensure that the process won't exit until the requests are done // For all background things that have to happen with certainty,
// (if we had just enqueued the resp_stream.collect directly in the background runner, // they have to be put in a proper queue that is persisted to disk.
// the requests might have been put on hold in the background runner's queue, tokio::spawn(async move {
// in which case they might timeout or otherwise fail)
let wait_finished_fut = tokio::spawn(async move {
resp_stream.collect::<Vec<Result<_, _>>>().await; resp_stream.collect::<Vec<Result<_, _>>>().await;
}); });
self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
} }
} }

View file

@ -21,7 +21,6 @@ use netapp::peering::fullmesh::FullMeshPeeringStrategy;
use netapp::util::parse_and_resolve_peer_addr_async; use netapp::util::parse_and_resolve_peer_addr_async;
use netapp::{NetApp, NetworkKey, NodeID, NodeKey}; use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
use garage_util::background::BackgroundRunner;
use garage_util::config::Config; use garage_util::config::Config;
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
use garage_util::config::KubernetesDiscoveryConfig; use garage_util::config::KubernetesDiscoveryConfig;
@ -50,8 +49,6 @@ pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008
/// RPC endpoint used for calls related to membership /// RPC endpoint used for calls related to membership
pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc";
pub const CONNECT_ERROR_MESSAGE: &str = "Error establishing RPC connection to remote node. This can happen if the remote node is not reachable on the network, but also if the two nodes are not configured with the same rpc_secret";
/// RPC messages related to membership /// RPC messages related to membership
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub enum SystemRpc { pub enum SystemRpc {
@ -110,9 +107,6 @@ pub struct System {
pub ring: watch::Receiver<Arc<Ring>>, pub ring: watch::Receiver<Arc<Ring>>,
update_ring: Mutex<watch::Sender<Arc<Ring>>>, update_ring: Mutex<watch::Sender<Arc<Ring>>>,
/// The job runner of this node
pub background: Arc<BackgroundRunner>,
/// Path to metadata directory /// Path to metadata directory
pub metadata_dir: PathBuf, pub metadata_dir: PathBuf,
} }
@ -232,7 +226,6 @@ impl System {
/// Create this node's membership manager /// Create this node's membership manager
pub fn new( pub fn new(
network_key: NetworkKey, network_key: NetworkKey,
background: Arc<BackgroundRunner>,
replication_mode: ReplicationMode, replication_mode: ReplicationMode,
config: &Config, config: &Config,
) -> Result<Arc<Self>, Error> { ) -> Result<Arc<Self>, Error> {
@ -354,7 +347,6 @@ impl System {
rpc: RpcHelper::new( rpc: RpcHelper::new(
netapp.id.into(), netapp.id.into(),
fullmesh, fullmesh,
background.clone(),
ring.clone(), ring.clone(),
config.rpc_timeout_msec.map(Duration::from_millis), config.rpc_timeout_msec.map(Duration::from_millis),
), ),
@ -372,7 +364,6 @@ impl System {
ring, ring,
update_ring: Mutex::new(update_ring), update_ring: Mutex::new(update_ring),
background,
metadata_dir: config.metadata_dir.clone(), metadata_dir: config.metadata_dir.clone(),
}); });
sys.system_endpoint.set_handler(sys.clone()); sys.system_endpoint.set_handler(sys.clone());
@ -444,17 +435,14 @@ impl System {
)) ))
})?; })?;
let mut errors = vec![]; let mut errors = vec![];
for ip in addrs.iter() { for addr in addrs.iter() {
match self match self.netapp.clone().try_connect(*addr, pubkey).await {
.netapp
.clone()
.try_connect(*ip, pubkey)
.await
.err_context(CONNECT_ERROR_MESSAGE)
{
Ok(()) => return Ok(()), Ok(()) => return Ok(()),
Err(e) => { Err(e) => {
errors.push((*ip, e)); errors.push((
*addr,
Error::Message(connect_error_message(*addr, pubkey, e)),
));
} }
} }
} }
@ -578,7 +566,7 @@ impl System {
} }
/// Save network configuration to disc /// Save network configuration to disc
async fn save_cluster_layout(self: Arc<Self>) -> Result<(), Error> { async fn save_cluster_layout(&self) -> Result<(), Error> {
let ring: Arc<Ring> = self.ring.borrow().clone(); let ring: Arc<Ring> = self.ring.borrow().clone();
self.persist_cluster_layout self.persist_cluster_layout
.save_async(&ring.layout) .save_async(&ring.layout)
@ -630,11 +618,7 @@ impl System {
if info.cluster_layout_version > local_info.cluster_layout_version if info.cluster_layout_version > local_info.cluster_layout_version
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
{ {
let self2 = self.clone(); tokio::spawn(self.clone().pull_cluster_layout(from));
self.background.spawn_cancellable(async move {
self2.pull_cluster_layout(from).await;
Ok(())
});
} }
self.node_status self.node_status
@ -676,18 +660,21 @@ impl System {
drop(update_ring); drop(update_ring);
let self2 = self.clone(); let self2 = self.clone();
self.background.spawn_cancellable(async move { tokio::spawn(async move {
self2 if let Err(e) = self2
.rpc .rpc
.broadcast( .broadcast(
&self2.system_endpoint, &self2.system_endpoint,
SystemRpc::AdvertiseClusterLayout(layout), SystemRpc::AdvertiseClusterLayout(layout),
RequestStrategy::with_priority(PRIO_HIGH), RequestStrategy::with_priority(PRIO_HIGH),
) )
.await?; .await
Ok(()) {
warn!("Error while broadcasting new cluster layout: {}", e);
}
}); });
self.background.spawn(self.clone().save_cluster_layout());
self.save_cluster_layout().await?;
} }
Ok(SystemRpc::Ok) Ok(SystemRpc::Ok)
@ -773,12 +760,12 @@ impl System {
} }
for (node_id, node_addr) in ping_list { for (node_id, node_addr) in ping_list {
tokio::spawn( let self2 = self.clone();
self.netapp tokio::spawn(async move {
.clone() if let Err(e) = self2.netapp.clone().try_connect(node_addr, node_id).await {
.try_connect(node_addr, node_id) error!("{}", connect_error_message(node_addr, node_id, e));
.map(|r| r.err_context(CONNECT_ERROR_MESSAGE)), }
); });
} }
} }
@ -787,11 +774,10 @@ impl System {
} }
#[cfg(feature = "consul-discovery")] #[cfg(feature = "consul-discovery")]
self.background.spawn(self.clone().advertise_to_consul()); background::spawn(self.clone().advertise_to_consul());
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
self.background background::spawn(self.clone().advertise_to_kubernetes());
.spawn(self.clone().advertise_to_kubernetes());
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL); let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
select! { select! {
@ -881,3 +867,11 @@ async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> {
ret ret
} }
/// Builds the message reported when an RPC connection to a remote node
/// could not be established.
///
/// The message identifies the peer as `<hex-encoded public key>@<address>`,
/// lists the two usual causes (node unreachable, mismatched `rpc_secret`)
/// and ends with the underlying error `e`.
fn connect_error_message(
    addr: SocketAddr,
    pubkey: ed25519::PublicKey,
    e: netapp::error::Error,
) -> String {
    // Hex form of the peer's public key, as shown in the `key@addr` notation.
    let peer_key_hex = hex::encode(pubkey);
    format!(
        "Error establishing RPC connection to remote node: {}@{}.\nThis can happen if the remote node is not reachable on the network, but also if the two nodes are not configured with the same rpc_secret.\n{}",
        peer_key_hex, addr, e
    )
}

View file

@ -21,6 +21,7 @@ garage_util = { version = "0.8.1", path = "../util" }
opentelemetry = "0.17" opentelemetry = "0.17"
async-trait = "0.1.7" async-trait = "0.1.7"
arc-swap = "1.0"
bytes = "1.0" bytes = "1.0"
hex = "0.4" hex = "0.4"
hexdump = "0.1" hexdump = "0.1"

View file

@ -31,6 +31,10 @@ pub struct TableData<F: TableSchema, R: TableReplication> {
pub(crate) merkle_tree: db::Tree, pub(crate) merkle_tree: db::Tree,
pub(crate) merkle_todo: db::Tree, pub(crate) merkle_todo: db::Tree,
pub(crate) merkle_todo_notify: Notify, pub(crate) merkle_todo_notify: Notify,
pub(crate) insert_queue: db::Tree,
pub(crate) insert_queue_notify: Notify,
pub(crate) gc_todo: CountedTree, pub(crate) gc_todo: CountedTree,
pub(crate) metrics: TableMetrics, pub(crate) metrics: TableMetrics,
@ -53,9 +57,13 @@ where
.open_tree(&format!("{}:merkle_todo", F::TABLE_NAME)) .open_tree(&format!("{}:merkle_todo", F::TABLE_NAME))
.expect("Unable to open DB Merkle TODO tree"); .expect("Unable to open DB Merkle TODO tree");
let insert_queue = db
.open_tree(&format!("{}:insert_queue", F::TABLE_NAME))
.expect("Unable to open insert queue DB tree");
let gc_todo = db let gc_todo = db
.open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME)) .open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME))
.expect("Unable to open DB tree"); .expect("Unable to open GC DB tree");
let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2"); let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2");
let metrics = TableMetrics::new( let metrics = TableMetrics::new(
@ -74,6 +82,8 @@ where
merkle_tree, merkle_tree,
merkle_todo, merkle_todo,
merkle_todo_notify: Notify::new(), merkle_todo_notify: Notify::new(),
insert_queue,
insert_queue_notify: Notify::new(),
gc_todo, gc_todo,
metrics, metrics,
}) })
@ -173,9 +183,8 @@ where
pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> { pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
let update = self.decode_entry(update_bytes)?; let update = self.decode_entry(update_bytes)?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
self.update_entry_with(&tree_key[..], |ent| match ent { self.update_entry_with(update.partition_key(), update.sort_key(), |ent| match ent {
Some(mut ent) => { Some(mut ent) => {
ent.merge(&update); ent.merge(&update);
ent ent
@ -187,11 +196,14 @@ where
pub fn update_entry_with( pub fn update_entry_with(
&self, &self,
tree_key: &[u8], partition_key: &F::P,
sort_key: &F::S,
f: impl Fn(Option<F::E>) -> F::E, f: impl Fn(Option<F::E>) -> F::E,
) -> Result<Option<F::E>, Error> { ) -> Result<Option<F::E>, Error> {
let tree_key = self.tree_key(partition_key, sort_key);
let changed = self.store.db().transaction(|mut tx| { let changed = self.store.db().transaction(|mut tx| {
let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, tree_key)? { let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, &tree_key)? {
Some(old_bytes) => { Some(old_bytes) => {
let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?; let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?;
let new_entry = f(Some(old_entry.clone())); let new_entry = f(Some(old_entry.clone()));
@ -200,23 +212,23 @@ where
None => (None, None, f(None)), None => (None, None, f(None)),
}; };
// Scenario 1: the value changed, so of course there is a change // Changed can be true in two scenarios
let value_changed = Some(&new_entry) != old_entry.as_ref(); // Scenario 1: the actual represented value changed,
// so of course the messagepack encoding changed as well
// Scenario 2: the value didn't change but due to a migration in the // Scenario 2: the value didn't change but due to a migration in the
// data format, the messagepack encoding changed. In this case // data format, the messagepack encoding changed. In this case,
// we have to write the migrated value in the table and update // we also have to write the migrated value in the table and update
// the associated Merkle tree entry. // the associated Merkle tree entry.
let new_bytes = rmp_to_vec_all_named(&new_entry) let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RmpEncode) .map_err(Error::RmpEncode)
.map_err(db::TxError::Abort)?; .map_err(db::TxError::Abort)?;
let encoding_changed = Some(&new_bytes[..]) != old_bytes.as_ref().map(|x| &x[..]); let changed = Some(&new_bytes[..]) != old_bytes.as_deref();
drop(old_bytes); drop(old_bytes);
if value_changed || encoding_changed { if changed {
let new_bytes_hash = blake2sum(&new_bytes[..]); let new_bytes_hash = blake2sum(&new_bytes);
tx.insert(&self.merkle_todo, tree_key, new_bytes_hash.as_slice())?; tx.insert(&self.merkle_todo, &tree_key, new_bytes_hash.as_slice())?;
tx.insert(&self.store, tree_key, new_bytes)?; tx.insert(&self.store, &tree_key, new_bytes)?;
self.instance self.instance
.updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?; .updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?;
@ -242,7 +254,7 @@ where
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap(); let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
let nodes = self.replication.write_nodes(&pk_hash); let nodes = self.replication.write_nodes(&pk_hash);
if nodes.first() == Some(&self.system.id) { if nodes.first() == Some(&self.system.id) {
GcTodoEntry::new(tree_key.to_vec(), new_bytes_hash).save(&self.gc_todo)?; GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
} }
} }
@ -258,10 +270,11 @@ where
.db() .db()
.transaction(|mut tx| match tx.get(&self.store, k)? { .transaction(|mut tx| match tx.get(&self.store, k)? {
Some(cur_v) if cur_v == v => { Some(cur_v) if cur_v == v => {
let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?;
tx.remove(&self.store, k)?; tx.remove(&self.store, k)?;
tx.insert(&self.merkle_todo, k, vec![])?; tx.insert(&self.merkle_todo, k, vec![])?;
let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?;
self.instance.updated(&mut tx, Some(&old_entry), None)?; self.instance.updated(&mut tx, Some(&old_entry), None)?;
Ok(true) Ok(true)
} }
@ -285,10 +298,11 @@ where
.db() .db()
.transaction(|mut tx| match tx.get(&self.store, k)? { .transaction(|mut tx| match tx.get(&self.store, k)? {
Some(cur_v) if blake2sum(&cur_v[..]) == vhash => { Some(cur_v) if blake2sum(&cur_v[..]) == vhash => {
let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?;
tx.remove(&self.store, k)?; tx.remove(&self.store, k)?;
tx.insert(&self.merkle_todo, k, vec![])?; tx.insert(&self.merkle_todo, k, vec![])?;
let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?;
self.instance.updated(&mut tx, Some(&old_entry), None)?; self.instance.updated(&mut tx, Some(&old_entry), None)?;
Ok(true) Ok(true)
} }
@ -302,6 +316,32 @@ where
Ok(removed) Ok(removed)
} }
// ---- Insert queue functions ----
pub(crate) fn queue_insert(
&self,
tx: &mut db::Transaction,
ins: &F::E,
) -> db::TxResult<(), Error> {
let tree_key = self.tree_key(ins.partition_key(), ins.sort_key());
let new_entry = match tx.get(&self.insert_queue, &tree_key)? {
Some(old_v) => {
let mut entry = self.decode_entry(&old_v).map_err(db::TxError::Abort)?;
entry.merge(ins);
rmp_to_vec_all_named(&entry)
}
None => rmp_to_vec_all_named(ins),
};
let new_entry = new_entry
.map_err(Error::RmpEncode)
.map_err(db::TxError::Abort)?;
tx.insert(&self.insert_queue, &tree_key, new_entry)?;
self.insert_queue_notify.notify_one();
Ok(())
}
// ---- Utility functions ---- // ---- Utility functions ----
pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> { pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {

View file

@ -54,24 +54,25 @@ where
F: TableSchema + 'static, F: TableSchema + 'static,
R: TableReplication + 'static, R: TableReplication + 'static,
{ {
pub(crate) fn launch(system: Arc<System>, data: Arc<TableData<F, R>>) -> Arc<Self> { pub(crate) fn new(system: Arc<System>, data: Arc<TableData<F, R>>) -> Arc<Self> {
let endpoint = system let endpoint = system
.netapp .netapp
.endpoint(format!("garage_table/gc.rs/Rpc:{}", F::TABLE_NAME)); .endpoint(format!("garage_table/gc.rs/Rpc:{}", F::TABLE_NAME));
let gc = Arc::new(Self { let gc = Arc::new(Self {
system: system.clone(), system,
data, data,
endpoint, endpoint,
}); });
gc.endpoint.set_handler(gc.clone()); gc.endpoint.set_handler(gc.clone());
system.background.spawn_worker(GcWorker::new(gc.clone()));
gc gc
} }
/// Registers this table's garbage-collection worker on the background
/// runner `bg`.
pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
    let gc_worker = GcWorker::new(Arc::clone(self));
    bg.spawn_worker(gc_worker);
}
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> { async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
let now = now_msec(); let now = now_msec();
@ -347,10 +348,7 @@ where
} }
} }
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
tokio::time::sleep(self.wait_delay).await; tokio::time::sleep(self.wait_delay).await;
WorkerState::Busy WorkerState::Busy
} }

View file

@ -4,16 +4,18 @@
#[macro_use] #[macro_use]
extern crate tracing; extern crate tracing;
mod metrics;
pub mod schema; pub mod schema;
pub mod util; pub mod util;
pub mod data; pub mod data;
pub mod replication;
pub mod table;
mod gc; mod gc;
mod merkle; mod merkle;
pub mod replication; mod metrics;
mod queue;
mod sync; mod sync;
pub mod table;
pub use schema::*; pub use schema::*;
pub use table::*; pub use table::*;

View file

@ -3,6 +3,7 @@ use std::time::Duration;
use async_trait::async_trait; use async_trait::async_trait;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::select;
use tokio::sync::watch; use tokio::sync::watch;
use garage_db as db; use garage_db as db;
@ -69,17 +70,17 @@ where
F: TableSchema + 'static, F: TableSchema + 'static,
R: TableReplication + 'static, R: TableReplication + 'static,
{ {
pub(crate) fn launch(background: &BackgroundRunner, data: Arc<TableData<F, R>>) -> Arc<Self> { pub(crate) fn new(data: Arc<TableData<F, R>>) -> Arc<Self> {
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]); let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
let ret = Arc::new(Self { Arc::new(Self {
data, data,
empty_node_hash, empty_node_hash,
}); })
}
background.spawn_worker(MerkleWorker(ret.clone())); pub(crate) fn spawn_workers(self: &Arc<Self>, background: &BackgroundRunner) {
background.spawn_worker(MerkleWorker(self.clone()));
ret
} }
fn updater_loop_iter(&self) -> Result<WorkerState, Error> { fn updater_loop_iter(&self) -> Result<WorkerState, Error> {
@ -339,11 +340,11 @@ where
.unwrap() .unwrap()
} }
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
if *must_exit.borrow() { select! {
return WorkerState::Done; _ = tokio::time::sleep(Duration::from_secs(60)) => (),
_ = self.0.data.merkle_todo_notify.notified() => (),
} }
tokio::time::sleep(Duration::from_secs(10)).await;
WorkerState::Busy WorkerState::Busy
} }
} }

81
src/table/queue.rs Normal file
View file

@ -0,0 +1,81 @@
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use tokio::select;
use tokio::sync::watch;
use garage_util::background::*;
use garage_util::error::Error;
use crate::replication::*;
use crate::schema::*;
use crate::table::*;
/// Batch size threshold used by the insert queue worker: it stops collecting
/// queued entries for one `insert_many` call once this count is reached.
const BATCH_SIZE: usize = 100;
/// Background worker that drains the insert queue of a table.
///
/// The single field is the [`Table`] whose `data.insert_queue` is processed
/// and through which queued entries are inserted.
pub(crate) struct InsertQueueWorker<F: TableSchema + 'static, R: TableReplication + 'static>(
    pub(crate) Arc<Table<F, R>>,
);
#[async_trait]
impl<F, R> Worker for InsertQueueWorker<F, R>
where
    F: TableSchema + 'static,
    R: TableReplication + 'static,
{
    /// Name of this worker: `"<table name> queue"`.
    fn name(&self) -> String {
        format!("{} queue", F::TABLE_NAME)
    }

    /// Reports the number of entries currently in the insert queue
    /// (reported as 0 if the length cannot be read).
    fn status(&self) -> WorkerStatus {
        WorkerStatus {
            queue_length: Some(self.0.data.insert_queue.len().unwrap_or(0) as u64),
            ..Default::default()
        }
    }

    /// Processes one batch of the insert queue.
    ///
    /// Reads at most `BATCH_SIZE` entries from the queue, inserts them with
    /// `Table::insert_many`, then removes them from the queue. Returns
    /// `WorkerState::Idle` when the queue is empty and `WorkerState::Busy`
    /// after a batch was processed.
    ///
    /// # Errors
    /// Any iteration, decoding, insertion or transaction error is returned;
    /// since removal happens last, a failed batch stays in the queue.
    async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
        let mut kv_pairs = Vec::with_capacity(BATCH_SIZE);
        let mut values = Vec::with_capacity(BATCH_SIZE);

        for entry_kv in self.0.data.insert_queue.iter()? {
            let (k, v) = entry_kv?;

            values.push(self.0.data.decode_entry(&v)?);
            kv_pairs.push((k, v));

            // Stop at exactly BATCH_SIZE entries (the previous `>` test let
            // one extra entry into every batch).
            if kv_pairs.len() >= BATCH_SIZE {
                break;
            }
        }

        if kv_pairs.is_empty() {
            return Ok(WorkerState::Idle);
        }

        self.0.insert_many(values).await?;

        self.0.data.insert_queue.db().transaction(|mut tx| {
            for (k, v) in kv_pairs.iter() {
                // Only remove an entry if its queued bytes are still the ones
                // that were just sent: a value changed in the meantime must
                // stay queued so that the newer version is sent as well.
                if let Some(v2) = tx.get(&self.0.data.insert_queue, k)? {
                    if &v2 == v {
                        tx.remove(&self.0.data.insert_queue, k)?;
                    }
                }
            }
            Ok(())
        })?;

        Ok(WorkerState::Busy)
    }

    /// Sleeps until `insert_queue_notify` is notified or 600 seconds have
    /// elapsed, whichever comes first, then asks to be run again.
    async fn wait_for_work(&mut self) -> WorkerState {
        select! {
            _ = tokio::time::sleep(Duration::from_secs(600)) => (),
            _ = self.0.data.insert_queue_notify.notified() => (),
        }
        WorkerState::Busy
    }
}

View file

@ -2,6 +2,7 @@ use std::collections::VecDeque;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use arc_swap::ArcSwapOption;
use async_trait::async_trait; use async_trait::async_trait;
use futures_util::stream::*; use futures_util::stream::*;
use opentelemetry::KeyValue; use opentelemetry::KeyValue;
@ -13,7 +14,7 @@ use tokio::sync::{mpsc, watch};
use garage_util::background::*; use garage_util::background::*;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::{Error, OkOrMessage};
use garage_rpc::ring::*; use garage_rpc::ring::*;
use garage_rpc::system::System; use garage_rpc::system::System;
@ -32,7 +33,7 @@ pub struct TableSyncer<F: TableSchema + 'static, R: TableReplication + 'static>
data: Arc<TableData<F, R>>, data: Arc<TableData<F, R>>,
merkle: Arc<MerkleUpdater<F, R>>, merkle: Arc<MerkleUpdater<F, R>>,
add_full_sync_tx: mpsc::UnboundedSender<()>, add_full_sync_tx: ArcSwapOption<mpsc::UnboundedSender<()>>,
endpoint: Arc<Endpoint<SyncRpc, Self>>, endpoint: Arc<Endpoint<SyncRpc, Self>>,
} }
@ -65,7 +66,7 @@ where
F: TableSchema + 'static, F: TableSchema + 'static,
R: TableReplication + 'static, R: TableReplication + 'static,
{ {
pub(crate) fn launch( pub(crate) fn new(
system: Arc<System>, system: Arc<System>,
data: Arc<TableData<F, R>>, data: Arc<TableData<F, R>>,
merkle: Arc<MerkleUpdater<F, R>>, merkle: Arc<MerkleUpdater<F, R>>,
@ -74,34 +75,40 @@ where
.netapp .netapp
.endpoint(format!("garage_table/sync.rs/Rpc:{}", F::TABLE_NAME)); .endpoint(format!("garage_table/sync.rs/Rpc:{}", F::TABLE_NAME));
let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
let syncer = Arc::new(Self { let syncer = Arc::new(Self {
system: system.clone(), system,
data, data,
merkle, merkle,
add_full_sync_tx, add_full_sync_tx: ArcSwapOption::new(None),
endpoint, endpoint,
}); });
syncer.endpoint.set_handler(syncer.clone()); syncer.endpoint.set_handler(syncer.clone());
system.background.spawn_worker(SyncWorker {
syncer: syncer.clone(),
ring_recv: system.ring.clone(),
ring: system.ring.borrow().clone(),
add_full_sync_rx,
todo: vec![],
next_full_sync: Instant::now() + Duration::from_secs(20),
});
syncer syncer
} }
pub fn add_full_sync(&self) { pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
if self.add_full_sync_tx.send(()).is_err() { let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
error!("({}) Could not add full sync", F::TABLE_NAME); self.add_full_sync_tx
} .store(Some(Arc::new(add_full_sync_tx)));
bg.spawn_worker(SyncWorker {
syncer: self.clone(),
ring_recv: self.system.ring.clone(),
ring: self.system.ring.borrow().clone(),
add_full_sync_rx,
todo: vec![],
next_full_sync: Instant::now() + Duration::from_secs(20),
});
}
pub fn add_full_sync(&self) -> Result<(), Error> {
let tx = self.add_full_sync_tx.load();
let tx = tx
.as_ref()
.ok_or_message("table sync worker is not running")?;
tx.send(()).ok_or_message("send error")?;
Ok(())
} }
// ---- // ----
@ -586,10 +593,7 @@ impl<F: TableSchema + 'static, R: TableReplication + 'static> Worker for SyncWor
} }
} }
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
select! { select! {
s = self.add_full_sync_rx.recv() => { s = self.add_full_sync_rx.recv() => {
if let Some(()) = s { if let Some(()) = s {

View file

@ -14,6 +14,7 @@ use opentelemetry::{
use garage_db as db; use garage_db as db;
use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::metrics::RecordDuration; use garage_util::metrics::RecordDuration;
@ -25,6 +26,7 @@ use crate::crdt::Crdt;
use crate::data::*; use crate::data::*;
use crate::gc::*; use crate::gc::*;
use crate::merkle::*; use crate::merkle::*;
use crate::queue::InsertQueueWorker;
use crate::replication::*; use crate::replication::*;
use crate::schema::*; use crate::schema::*;
use crate::sync::*; use crate::sync::*;
@ -35,6 +37,7 @@ pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
pub data: Arc<TableData<F, R>>, pub data: Arc<TableData<F, R>>,
pub merkle_updater: Arc<MerkleUpdater<F, R>>, pub merkle_updater: Arc<MerkleUpdater<F, R>>,
pub syncer: Arc<TableSyncer<F, R>>, pub syncer: Arc<TableSyncer<F, R>>,
gc: Arc<TableGc<F, R>>,
endpoint: Arc<Endpoint<TableRpc<F>, Self>>, endpoint: Arc<Endpoint<TableRpc<F>, Self>>,
} }
@ -75,15 +78,16 @@ where
let data = TableData::new(system.clone(), instance, replication, db); let data = TableData::new(system.clone(), instance, replication, db);
let merkle_updater = MerkleUpdater::launch(&system.background, data.clone()); let merkle_updater = MerkleUpdater::new(data.clone());
let syncer = TableSyncer::launch(system.clone(), data.clone(), merkle_updater.clone()); let syncer = TableSyncer::new(system.clone(), data.clone(), merkle_updater.clone());
TableGc::launch(system.clone(), data.clone()); let gc = TableGc::new(system.clone(), data.clone());
let table = Arc::new(Self { let table = Arc::new(Self {
system, system,
data, data,
merkle_updater, merkle_updater,
gc,
syncer, syncer,
endpoint, endpoint,
}); });
@ -93,6 +97,13 @@ where
table table
} }
/// Starts the background workers of this table on the runner `bg`.
///
/// In order: the workers of the Merkle updater, of the syncer and of the
/// garbage collector, then the insert queue worker for this table.
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
    self.merkle_updater.spawn_workers(bg);
    self.syncer.spawn_workers(bg);
    self.gc.spawn_workers(bg);

    let insert_queue_worker = InsertQueueWorker(Arc::clone(self));
    bg.spawn_worker(insert_queue_worker);
}
pub async fn insert(&self, e: &F::E) -> Result<(), Error> { pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let tracer = opentelemetry::global::tracer("garage_table"); let tracer = opentelemetry::global::tracer("garage_table");
let span = tracer.start(format!("{} insert", F::TABLE_NAME)); let span = tracer.start(format!("{} insert", F::TABLE_NAME));
@ -128,6 +139,11 @@ where
Ok(()) Ok(())
} }
/// Enqueue the insertion of `e` in this table's insert queue, as part of
/// the database transaction `tx`.
///
/// This only delegates to `queue_insert` on `self.data`; the entry is not
/// inserted into the table by this call.
pub fn queue_insert(&self, tx: &mut db::Transaction, e: &F::E) -> db::TxResult<(), Error> {
self.data.queue_insert(tx, e)
}
pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error> pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error>
where where
I: IntoIterator<Item = IE> + Send + Sync, I: IntoIterator<Item = IE> + Send + Sync,
@ -259,9 +275,11 @@ where
if not_all_same { if not_all_same {
let self2 = self.clone(); let self2 = self.clone();
let ent2 = ret_entry.clone(); let ent2 = ret_entry.clone();
self.system tokio::spawn(async move {
.background if let Err(e) = self2.repair_on_read(&who[..], ent2).await {
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await }); warn!("Error doing repair on read: {}", e);
}
});
} }
} }
@ -358,11 +376,12 @@ where
.into_iter() .into_iter()
.map(|k| ret.get(&k).unwrap().clone()) .map(|k| ret.get(&k).unwrap().clone())
.collect::<Vec<_>>(); .collect::<Vec<_>>();
self.system.background.spawn_cancellable(async move { tokio::spawn(async move {
for v in to_repair { for v in to_repair {
self2.repair_on_read(&who[..], v).await?; if let Err(e) = self2.repair_on_read(&who[..], v).await {
warn!("Error doing repair on read: {}", e);
}
} }
Ok(())
}); });
} }

View file

@ -1,48 +0,0 @@
//! Job worker: a generic worker that just processes incoming
//! jobs one by one
use std::sync::Arc;
use async_trait::async_trait;
use tokio::sync::{mpsc, Mutex};
use crate::background::worker::*;
use crate::background::*;
/// Generic worker that takes jobs from a channel and runs them one by one.
pub(crate) struct JobWorker {
/// Index of this worker, used in its name (`Job worker #<index>`).
pub(crate) index: usize,
/// Receiving end of the job channel, behind an async mutex; each item is
/// a `(job, cancellable)` pair.
pub(crate) job_chan: Arc<Mutex<mpsc::UnboundedReceiver<(Job, bool)>>>,
/// Job picked up by `wait_for_work` that `work` has not run yet.
pub(crate) next_job: Option<Job>,
}
#[async_trait]
impl Worker for JobWorker {
    /// Name of this worker: `Job worker #<index>`.
    fn name(&self) -> String {
        format!("Job worker #{}", self.index)
    }

    /// Runs the pending job, if there is one.
    ///
    /// Returns `WorkerState::Idle` when no job is pending, and
    /// `WorkerState::Busy` after a job completed successfully; the job's
    /// error is propagated otherwise.
    async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
        if let Some(job) = self.next_job.take() {
            job.await?;
            Ok(WorkerState::Busy)
        } else {
            Ok(WorkerState::Idle)
        }
    }

    /// Waits for the next job on the channel and stores it in `next_job`.
    ///
    /// Jobs marked cancellable are dropped without being run once
    /// `must_exit` is set. Returns `WorkerState::Done` when the channel is
    /// closed, `WorkerState::Busy` once a job has been stored.
    async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
        loop {
            let (job, cancellable) = match self.job_chan.lock().await.recv().await {
                Some(received) => received,
                None => return WorkerState::Done,
            };

            // Skip cancellable jobs when the process is shutting down.
            if cancellable && *must_exit.borrow() {
                continue;
            }

            self.next_job = Some(job);
            return WorkerState::Busy;
        }
    }
}

View file

@ -1,27 +1,18 @@
//! Job runner for futures and async functions //! Job runner for futures and async functions
pub mod job_worker;
pub mod worker; pub mod worker;
use core::future::Future;
use std::collections::HashMap; use std::collections::HashMap;
use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::sync::{mpsc, watch, Mutex}; use tokio::sync::{mpsc, watch};
use crate::error::Error;
use worker::WorkerProcessor; use worker::WorkerProcessor;
pub use worker::{Worker, WorkerState}; pub use worker::{Worker, WorkerState};
pub(crate) type JobOutput = Result<(), Error>;
pub(crate) type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
/// Job runner for futures and async functions /// Job runner for futures and async functions
pub struct BackgroundRunner { pub struct BackgroundRunner {
send_job: mpsc::UnboundedSender<(Job, bool)>,
send_worker: mpsc::UnboundedSender<Box<dyn Worker>>, send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>, worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
} }
@ -49,10 +40,7 @@ pub struct WorkerStatus {
impl BackgroundRunner { impl BackgroundRunner {
/// Create a new BackgroundRunner /// Create a new BackgroundRunner
pub fn new( pub fn new(stop_signal: watch::Receiver<bool>) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
n_runners: usize,
stop_signal: watch::Receiver<bool>,
) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>(); let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new())); let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
@ -63,24 +51,7 @@ impl BackgroundRunner {
worker_processor.run().await; worker_processor.run().await;
}); });
let (send_job, queue_out) = mpsc::unbounded_channel();
let queue_out = Arc::new(Mutex::new(queue_out));
for i in 0..n_runners {
let queue_out = queue_out.clone();
send_worker
.send(Box::new(job_worker::JobWorker {
index: i,
job_chan: queue_out.clone(),
next_job: None,
}))
.ok()
.unwrap();
}
let bgrunner = Arc::new(Self { let bgrunner = Arc::new(Self {
send_job,
send_worker, send_worker,
worker_info, worker_info,
}); });
@ -91,31 +62,6 @@ impl BackgroundRunner {
self.worker_info.lock().unwrap().clone() self.worker_info.lock().unwrap().clone()
} }
/// Queue a future to be executed in the background.
///
/// The future is boxed and pinned, then pushed on the job channel with the
/// cancellable flag set to `false`. Panics if the job cannot be put in the
/// channel.
pub fn spawn<T>(&self, job: T)
where
    T: Future<Output = JobOutput> + Send + 'static,
{
    let pinned: Job = Box::pin(job);
    let queued = self.send_job.send((pinned, false));
    queued.ok().expect("Could not put job in queue");
}
/// Queue a future to be executed in the background, marked as cancellable:
/// it may get discarded before running if it was spawned while the runner
/// is stopping.
///
/// The future is boxed and pinned, then pushed on the job channel with the
/// cancellable flag set to `true`. Panics if the job cannot be put in the
/// channel.
pub fn spawn_cancellable<T>(&self, job: T)
where
    T: Future<Output = JobOutput> + Send + 'static,
{
    let pinned: Job = Box::pin(job);
    let queued = self.send_job.send((pinned, true));
    queued.ok().expect("Could not put job in queue");
}
pub fn spawn_worker<W>(&self, worker: W) pub fn spawn_worker<W>(&self, worker: W)
where where
W: Worker + 'static, W: Worker + 'static,

View file

@ -1,6 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::Duration;
use async_trait::async_trait; use async_trait::async_trait;
use futures::future::*; use futures::future::*;
@ -14,6 +14,10 @@ use crate::background::{WorkerInfo, WorkerStatus};
use crate::error::Error; use crate::error::Error;
use crate::time::now_msec; use crate::time::now_msec;
// All workers that haven't exited for this time after an exit signal was received
// will be interrupted in the middle of whatever they are doing.
const EXIT_DEADLINE: Duration = Duration::from_secs(8);
#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)] #[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
pub enum WorkerState { pub enum WorkerState {
Busy, Busy,
@ -50,10 +54,8 @@ pub trait Worker: Send {
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>; async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
/// Wait for work: await for some task to become available. This future can be interrupted in /// Wait for work: await for some task to become available. This future can be interrupted in
/// the middle for any reason. This future doesn't have to await on must_exit.changed(), we /// the middle for any reason, for example if an interrupt signal was received.
/// are doing it for you. Therefore it only receives a read refernce to must_exit which allows async fn wait_for_work(&mut self) -> WorkerState;
/// it to check if we are exiting.
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState;
} }
pub(crate) struct WorkerProcessor { pub(crate) struct WorkerProcessor {
@ -93,11 +95,9 @@ impl WorkerProcessor {
let task_id = next_task_id; let task_id = next_task_id;
next_task_id += 1; next_task_id += 1;
let stop_signal = self.stop_signal.clone(); let stop_signal = self.stop_signal.clone();
let stop_signal_worker = self.stop_signal.clone();
let mut worker = WorkerHandler { let mut worker = WorkerHandler {
task_id, task_id,
stop_signal, stop_signal,
stop_signal_worker,
worker: new_worker, worker: new_worker,
state: WorkerState::Busy, state: WorkerState::Busy,
errors: 0, errors: 0,
@ -153,26 +153,14 @@ impl WorkerProcessor {
} }
// We are exiting, drain everything // We are exiting, drain everything
let drain_half_time = Instant::now() + Duration::from_secs(5);
let drain_everything = async move { let drain_everything = async move {
while let Some(mut worker) = workers.next().await { while let Some(worker) = workers.next().await {
if worker.state == WorkerState::Done { info!(
info!( "Worker {} (TID {}) exited (last state: {:?})",
"Worker {} (TID {}) exited", worker.worker.name(),
worker.worker.name(), worker.task_id,
worker.task_id worker.state
); );
} else if Instant::now() > drain_half_time {
warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state);
} else {
workers.push(
async move {
worker.step().await;
worker
}
.boxed(),
);
}
} }
}; };
@ -180,7 +168,7 @@ impl WorkerProcessor {
_ = drain_everything => { _ = drain_everything => {
info!("All workers exited peacefully \\o/"); info!("All workers exited peacefully \\o/");
} }
_ = tokio::time::sleep(Duration::from_secs(9)) => { _ = tokio::time::sleep(EXIT_DEADLINE) => {
error!("Some workers could not exit in time, we are cancelling some things in the middle"); error!("Some workers could not exit in time, we are cancelling some things in the middle");
} }
} }
@ -190,7 +178,6 @@ impl WorkerProcessor {
struct WorkerHandler { struct WorkerHandler {
task_id: usize, task_id: usize,
stop_signal: watch::Receiver<bool>, stop_signal: watch::Receiver<bool>,
stop_signal_worker: watch::Receiver<bool>,
worker: Box<dyn Worker>, worker: Box<dyn Worker>,
state: WorkerState, state: WorkerState,
errors: usize, errors: usize,
@ -225,33 +212,19 @@ impl WorkerHandler {
}, },
WorkerState::Throttled(delay) => { WorkerState::Throttled(delay) => {
// Sleep for given delay and go back to busy state // Sleep for given delay and go back to busy state
if !*self.stop_signal.borrow() { select! {
select! { _ = tokio::time::sleep(Duration::from_secs_f32(delay)) => {
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (), self.state = WorkerState::Busy;
_ = self.stop_signal.changed() => (),
} }
_ = self.stop_signal.changed() => (),
} }
self.state = WorkerState::Busy;
} }
WorkerState::Idle => { WorkerState::Idle => {
if *self.stop_signal.borrow() { select! {
select! { new_st = self.worker.wait_for_work() => {
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => { self.state = new_st;
self.state = new_st;
}
_ = tokio::time::sleep(Duration::from_secs(1)) => {
// stay in Idle state
}
}
} else {
select! {
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
self.state = new_st;
}
_ = self.stop_signal.changed() => {
// stay in Idle state
}
} }
_ = self.stop_signal.changed() => (),
} }
} }
WorkerState::Done => unreachable!(), WorkerState::Done => unreachable!(),