Some improvements to Garage internals #451
16 changed files with 89 additions and 213 deletions
|
@ -23,6 +23,7 @@ use garage_rpc::rpc_helper::netapp::stream::{stream_asyncread, ByteStream};
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
|
use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::*;
|
use garage_util::error::*;
|
||||||
use garage_util::metrics::RecordDuration;
|
use garage_util::metrics::RecordDuration;
|
||||||
|
@ -144,19 +145,17 @@ impl BlockManager {
|
||||||
block_manager
|
block_manager
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn spawn_workers(self: &Arc<Self>) {
|
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
// Spawn a bunch of resync workers
|
// Spawn a bunch of resync workers
|
||||||
for index in 0..MAX_RESYNC_WORKERS {
|
for index in 0..MAX_RESYNC_WORKERS {
|
||||||
let worker = ResyncWorker::new(index, self.clone());
|
let worker = ResyncWorker::new(index, self.clone());
|
||||||
self.system.background.spawn_worker(worker);
|
bg.spawn_worker(worker);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Spawn scrub worker
|
// Spawn scrub worker
|
||||||
let (scrub_tx, scrub_rx) = mpsc::channel(1);
|
let (scrub_tx, scrub_rx) = mpsc::channel(1);
|
||||||
self.tx_scrub_command.store(Some(Arc::new(scrub_tx)));
|
self.tx_scrub_command.store(Some(Arc::new(scrub_tx)));
|
||||||
self.system
|
bg.spawn_worker(ScrubWorker::new(self.clone(), scrub_rx));
|
||||||
.background
|
|
||||||
.spawn_worker(ScrubWorker::new(self.clone(), scrub_rx));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Ask nodes that might have a (possibly compressed) block for it
|
/// Ask nodes that might have a (possibly compressed) block for it
|
||||||
|
|
|
@ -5,6 +5,7 @@ use std::sync::Arc;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::crdt::*;
|
use garage_util::crdt::*;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error as GarageError;
|
use garage_util::error::Error as GarageError;
|
||||||
|
@ -74,13 +75,18 @@ impl Rpc for AdminRpc {
|
||||||
|
|
||||||
pub struct AdminRpcHandler {
|
pub struct AdminRpcHandler {
|
||||||
garage: Arc<Garage>,
|
garage: Arc<Garage>,
|
||||||
|
background: Arc<BackgroundRunner>,
|
||||||
endpoint: Arc<Endpoint<AdminRpc, Self>>,
|
endpoint: Arc<Endpoint<AdminRpc, Self>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AdminRpcHandler {
|
impl AdminRpcHandler {
|
||||||
pub fn new(garage: Arc<Garage>) -> Arc<Self> {
|
pub fn new(garage: Arc<Garage>, background: Arc<BackgroundRunner>) -> Arc<Self> {
|
||||||
let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into());
|
let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into());
|
||||||
let admin = Arc::new(Self { garage, endpoint });
|
let admin = Arc::new(Self {
|
||||||
|
garage,
|
||||||
|
background,
|
||||||
|
endpoint,
|
||||||
|
});
|
||||||
admin.endpoint.set_handler(admin.clone());
|
admin.endpoint.set_handler(admin.clone());
|
||||||
admin
|
admin
|
||||||
}
|
}
|
||||||
|
@ -759,7 +765,7 @@ impl AdminRpcHandler {
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
launch_online_repair(self.garage.clone(), opt).await?;
|
launch_online_repair(&self.garage, &self.background, opt).await?;
|
||||||
Ok(AdminRpc::Ok(format!(
|
Ok(AdminRpc::Ok(format!(
|
||||||
"Repair launched on {:?}",
|
"Repair launched on {:?}",
|
||||||
self.garage.system.id
|
self.garage.system.id
|
||||||
|
@ -925,12 +931,11 @@ impl AdminRpcHandler {
|
||||||
async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> {
|
async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> {
|
||||||
match cmd {
|
match cmd {
|
||||||
WorkerOperation::List { opt } => {
|
WorkerOperation::List { opt } => {
|
||||||
let workers = self.garage.background.get_worker_info();
|
let workers = self.background.get_worker_info();
|
||||||
Ok(AdminRpc::WorkerList(workers, *opt))
|
Ok(AdminRpc::WorkerList(workers, *opt))
|
||||||
}
|
}
|
||||||
WorkerOperation::Info { tid } => {
|
WorkerOperation::Info { tid } => {
|
||||||
let info = self
|
let info = self
|
||||||
.garage
|
|
||||||
.background
|
.background
|
||||||
.get_worker_info()
|
.get_worker_info()
|
||||||
.get(tid)
|
.get(tid)
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use tokio::sync::watch;
|
|
||||||
|
|
||||||
use garage_util::background::*;
|
|
||||||
use garage_util::config::*;
|
use garage_util::config::*;
|
||||||
use garage_util::error::*;
|
use garage_util::error::*;
|
||||||
|
|
||||||
|
@ -20,12 +17,8 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
|
||||||
info!("Loading configuration...");
|
info!("Loading configuration...");
|
||||||
let config = read_config(config_file)?;
|
let config = read_config(config_file)?;
|
||||||
|
|
||||||
info!("Initializing background runner...");
|
|
||||||
let (done_tx, done_rx) = watch::channel(false);
|
|
||||||
let (background, await_background_done) = BackgroundRunner::new(16, done_rx);
|
|
||||||
|
|
||||||
info!("Initializing Garage main data store...");
|
info!("Initializing Garage main data store...");
|
||||||
let garage = Garage::new(config.clone(), background)?;
|
let garage = Garage::new(config)?;
|
||||||
|
|
||||||
info!("Launching repair operation...");
|
info!("Launching repair operation...");
|
||||||
match opt.what {
|
match opt.what {
|
||||||
|
@ -43,13 +36,7 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Repair operation finished, shutting down Garage internals...");
|
info!("Repair operation finished, shutting down...");
|
||||||
done_tx.send(true).unwrap();
|
|
||||||
drop(garage);
|
|
||||||
|
|
||||||
await_background_done.await?;
|
|
||||||
|
|
||||||
info!("Cleaning up...");
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,11 @@ use garage_util::error::Error;
|
||||||
|
|
||||||
use crate::*;
|
use crate::*;
|
||||||
|
|
||||||
pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) -> Result<(), Error> {
|
pub async fn launch_online_repair(
|
||||||
|
garage: &Arc<Garage>,
|
||||||
|
bg: &BackgroundRunner,
|
||||||
|
opt: RepairOpt,
|
||||||
|
) -> Result<(), Error> {
|
||||||
match opt.what {
|
match opt.what {
|
||||||
RepairWhat::Tables => {
|
RepairWhat::Tables => {
|
||||||
info!("Launching a full sync of tables");
|
info!("Launching a full sync of tables");
|
||||||
|
@ -27,21 +31,15 @@ pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) -> Result
|
||||||
}
|
}
|
||||||
RepairWhat::Versions => {
|
RepairWhat::Versions => {
|
||||||
info!("Repairing the versions table");
|
info!("Repairing the versions table");
|
||||||
garage
|
bg.spawn_worker(RepairVersionsWorker::new(garage.clone()));
|
||||||
.background
|
|
||||||
.spawn_worker(RepairVersionsWorker::new(garage.clone()));
|
|
||||||
}
|
}
|
||||||
RepairWhat::BlockRefs => {
|
RepairWhat::BlockRefs => {
|
||||||
info!("Repairing the block refs table");
|
info!("Repairing the block refs table");
|
||||||
garage
|
bg.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
|
||||||
.background
|
|
||||||
.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
|
|
||||||
}
|
}
|
||||||
RepairWhat::Blocks => {
|
RepairWhat::Blocks => {
|
||||||
info!("Repairing the stored blocks");
|
info!("Repairing the stored blocks");
|
||||||
garage
|
bg.spawn_worker(garage_block::repair::RepairWorker::new(
|
||||||
.background
|
|
||||||
.spawn_worker(garage_block::repair::RepairWorker::new(
|
|
||||||
garage.block_manager.clone(),
|
garage.block_manager.clone(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,15 +35,15 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
|
||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
let metrics_exporter = opentelemetry_prometheus::exporter().init();
|
let metrics_exporter = opentelemetry_prometheus::exporter().init();
|
||||||
|
|
||||||
|
info!("Initializing Garage main data store...");
|
||||||
|
let garage = Garage::new(config.clone())?;
|
||||||
|
|
||||||
info!("Initializing background runner...");
|
info!("Initializing background runner...");
|
||||||
let watch_cancel = watch_shutdown_signal();
|
let watch_cancel = watch_shutdown_signal();
|
||||||
let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone());
|
let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone());
|
||||||
|
|
||||||
info!("Initializing Garage main data store...");
|
|
||||||
let garage = Garage::new(config.clone(), background)?;
|
|
||||||
|
|
||||||
info!("Spawning Garage workers...");
|
info!("Spawning Garage workers...");
|
||||||
garage.spawn_workers();
|
garage.spawn_workers(&background);
|
||||||
|
|
||||||
if config.admin.trace_sink.is_some() {
|
if config.admin.trace_sink.is_some() {
|
||||||
info!("Initialize tracing...");
|
info!("Initialize tracing...");
|
||||||
|
@ -66,7 +66,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
|
||||||
let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
|
let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
|
||||||
|
|
||||||
info!("Create admin RPC handler...");
|
info!("Create admin RPC handler...");
|
||||||
AdminRpcHandler::new(garage.clone());
|
AdminRpcHandler::new(garage.clone(), background.clone());
|
||||||
|
|
||||||
// ---- Launch public-facing API servers ----
|
// ---- Launch public-facing API servers ----
|
||||||
|
|
||||||
|
|
|
@ -39,8 +39,6 @@ pub struct Garage {
|
||||||
|
|
||||||
/// The local database
|
/// The local database
|
||||||
pub db: db::Db,
|
pub db: db::Db,
|
||||||
/// A background job runner
|
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
/// The membership manager
|
/// The membership manager
|
||||||
pub system: Arc<System>,
|
pub system: Arc<System>,
|
||||||
/// The block manager
|
/// The block manager
|
||||||
|
@ -78,7 +76,7 @@ pub struct GarageK2V {
|
||||||
|
|
||||||
impl Garage {
|
impl Garage {
|
||||||
/// Create and run garage
|
/// Create and run garage
|
||||||
pub fn new(config: Config, background: Arc<BackgroundRunner>) -> Result<Arc<Self>, Error> {
|
pub fn new(config: Config) -> Result<Arc<Self>, Error> {
|
||||||
// Create meta dir and data dir if they don't exist already
|
// Create meta dir and data dir if they don't exist already
|
||||||
std::fs::create_dir_all(&config.metadata_dir)
|
std::fs::create_dir_all(&config.metadata_dir)
|
||||||
.ok_or_message("Unable to create Garage metadata directory")?;
|
.ok_or_message("Unable to create Garage metadata directory")?;
|
||||||
|
@ -167,7 +165,7 @@ impl Garage {
|
||||||
.expect("Invalid replication_mode in config file.");
|
.expect("Invalid replication_mode in config file.");
|
||||||
|
|
||||||
info!("Initialize membership management system...");
|
info!("Initialize membership management system...");
|
||||||
let system = System::new(network_key, background.clone(), replication_mode, &config)?;
|
let system = System::new(network_key, replication_mode, &config)?;
|
||||||
|
|
||||||
let data_rep_param = TableShardedReplication {
|
let data_rep_param = TableShardedReplication {
|
||||||
system: system.clone(),
|
system: system.clone(),
|
||||||
|
@ -225,7 +223,6 @@ impl Garage {
|
||||||
info!("Initialize version_table...");
|
info!("Initialize version_table...");
|
||||||
let version_table = Table::new(
|
let version_table = Table::new(
|
||||||
VersionTable {
|
VersionTable {
|
||||||
background: background.clone(),
|
|
||||||
block_ref_table: block_ref_table.clone(),
|
block_ref_table: block_ref_table.clone(),
|
||||||
},
|
},
|
||||||
meta_rep_param.clone(),
|
meta_rep_param.clone(),
|
||||||
|
@ -240,7 +237,6 @@ impl Garage {
|
||||||
#[allow(clippy::redundant_clone)]
|
#[allow(clippy::redundant_clone)]
|
||||||
let object_table = Table::new(
|
let object_table = Table::new(
|
||||||
ObjectTable {
|
ObjectTable {
|
||||||
background: background.clone(),
|
|
||||||
version_table: version_table.clone(),
|
version_table: version_table.clone(),
|
||||||
object_counter_table: object_counter_table.clone(),
|
object_counter_table: object_counter_table.clone(),
|
||||||
},
|
},
|
||||||
|
@ -258,7 +254,6 @@ impl Garage {
|
||||||
config,
|
config,
|
||||||
replication_mode,
|
replication_mode,
|
||||||
db,
|
db,
|
||||||
background,
|
|
||||||
system,
|
system,
|
||||||
block_manager,
|
block_manager,
|
||||||
bucket_table,
|
bucket_table,
|
||||||
|
@ -273,20 +268,20 @@ impl Garage {
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn spawn_workers(&self) {
|
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
|
||||||
self.block_manager.spawn_workers();
|
self.block_manager.spawn_workers(bg);
|
||||||
|
|
||||||
self.bucket_table.spawn_workers();
|
self.bucket_table.spawn_workers(bg);
|
||||||
self.bucket_alias_table.spawn_workers();
|
self.bucket_alias_table.spawn_workers(bg);
|
||||||
self.key_table.spawn_workers();
|
self.key_table.spawn_workers(bg);
|
||||||
|
|
||||||
self.object_table.spawn_workers();
|
self.object_table.spawn_workers(bg);
|
||||||
self.object_counter_table.spawn_workers();
|
self.object_counter_table.spawn_workers(bg);
|
||||||
self.version_table.spawn_workers();
|
self.version_table.spawn_workers(bg);
|
||||||
self.block_ref_table.spawn_workers();
|
self.block_ref_table.spawn_workers(bg);
|
||||||
|
|
||||||
#[cfg(feature = "k2v")]
|
#[cfg(feature = "k2v")]
|
||||||
self.k2v.spawn_workers();
|
self.k2v.spawn_workers(bg);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
|
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
|
||||||
|
@ -324,8 +319,8 @@ impl GarageK2V {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn spawn_workers(&self) {
|
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
|
||||||
self.item_table.spawn_workers();
|
self.item_table.spawn_workers(bg);
|
||||||
self.counter_table.spawn_workers();
|
self.counter_table.spawn_workers(bg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,6 +9,7 @@ use garage_db as db;
|
||||||
|
|
||||||
use garage_rpc::ring::Ring;
|
use garage_rpc::ring::Ring;
|
||||||
use garage_rpc::system::System;
|
use garage_rpc::system::System;
|
||||||
|
use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::*;
|
use garage_util::error::*;
|
||||||
use garage_util::time::*;
|
use garage_util::time::*;
|
||||||
|
@ -164,8 +165,8 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn spawn_workers(&self) {
|
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
|
||||||
self.table.spawn_workers();
|
self.table.spawn_workers(bg);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count(
|
pub fn count(
|
||||||
|
|
|
@ -4,7 +4,6 @@ use std::sync::Arc;
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use garage_table::crdt::*;
|
use garage_table::crdt::*;
|
||||||
|
@ -221,7 +220,6 @@ impl Crdt for Object {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ObjectTable {
|
pub struct ObjectTable {
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
|
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
|
||||||
pub object_counter_table: Arc<IndexCounter<Object>>,
|
pub object_counter_table: Arc<IndexCounter<Object>>,
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,6 @@ use std::sync::Arc;
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use garage_table::crdt::*;
|
use garage_table::crdt::*;
|
||||||
|
@ -127,7 +126,6 @@ impl Crdt for Version {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct VersionTable {
|
pub struct VersionTable {
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
|
pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ use std::time::Duration;
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
use futures::stream::futures_unordered::FuturesUnordered;
|
use futures::stream::futures_unordered::FuturesUnordered;
|
||||||
use futures::stream::StreamExt;
|
use futures::stream::StreamExt;
|
||||||
use futures_util::future::FutureExt;
|
|
||||||
use tokio::select;
|
use tokio::select;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
|
|
||||||
|
@ -24,7 +23,6 @@ pub use netapp::message::{
|
||||||
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
|
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
|
||||||
pub use netapp::{self, NetApp, NodeID};
|
pub use netapp::{self, NetApp, NodeID};
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error;
|
use garage_util::error::Error;
|
||||||
use garage_util::metrics::RecordDuration;
|
use garage_util::metrics::RecordDuration;
|
||||||
|
@ -94,7 +92,6 @@ pub struct RpcHelper(Arc<RpcHelperInner>);
|
||||||
struct RpcHelperInner {
|
struct RpcHelperInner {
|
||||||
our_node_id: Uuid,
|
our_node_id: Uuid,
|
||||||
fullmesh: Arc<FullMeshPeeringStrategy>,
|
fullmesh: Arc<FullMeshPeeringStrategy>,
|
||||||
background: Arc<BackgroundRunner>,
|
|
||||||
ring: watch::Receiver<Arc<Ring>>,
|
ring: watch::Receiver<Arc<Ring>>,
|
||||||
metrics: RpcMetrics,
|
metrics: RpcMetrics,
|
||||||
rpc_timeout: Duration,
|
rpc_timeout: Duration,
|
||||||
|
@ -104,7 +101,6 @@ impl RpcHelper {
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
our_node_id: Uuid,
|
our_node_id: Uuid,
|
||||||
fullmesh: Arc<FullMeshPeeringStrategy>,
|
fullmesh: Arc<FullMeshPeeringStrategy>,
|
||||||
background: Arc<BackgroundRunner>,
|
|
||||||
ring: watch::Receiver<Arc<Ring>>,
|
ring: watch::Receiver<Arc<Ring>>,
|
||||||
rpc_timeout: Option<Duration>,
|
rpc_timeout: Option<Duration>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
|
@ -113,7 +109,6 @@ impl RpcHelper {
|
||||||
Self(Arc::new(RpcHelperInner {
|
Self(Arc::new(RpcHelperInner {
|
||||||
our_node_id,
|
our_node_id,
|
||||||
fullmesh,
|
fullmesh,
|
||||||
background,
|
|
||||||
ring,
|
ring,
|
||||||
metrics,
|
metrics,
|
||||||
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
|
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
|
||||||
|
@ -377,16 +372,13 @@ impl RpcHelper {
|
||||||
|
|
||||||
if !resp_stream.is_empty() {
|
if !resp_stream.is_empty() {
|
||||||
// Continue remaining requests in background.
|
// Continue remaining requests in background.
|
||||||
// Continue the remaining requests immediately using tokio::spawn
|
// Note: these requests can get interrupted on process shutdown, so
|
||||||
// but enqueue a task in the background runner
|
// we must not count on them being executed for certain.
|
||||||
// to ensure that the process won't exit until the requests are done
|
// All background tasks that have to happen with certainty
|
||||||
// (if we had just enqueued the resp_stream.collect directly in the background runner,
|
// have to be put in a proper queue that is persisted to disk.
|
||||||
// the requests might have been put on hold in the background runner's queue,
|
tokio::spawn(async move {
|
||||||
// in which case they might timeout or otherwise fail)
|
|
||||||
let wait_finished_fut = tokio::spawn(async move {
|
|
||||||
resp_stream.collect::<Vec<Result<_, _>>>().await;
|
resp_stream.collect::<Vec<Result<_, _>>>().await;
|
||||||
});
|
});
|
||||||
self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ use netapp::peering::fullmesh::FullMeshPeeringStrategy;
|
||||||
use netapp::util::parse_and_resolve_peer_addr_async;
|
use netapp::util::parse_and_resolve_peer_addr_async;
|
||||||
use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
|
use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
use garage_util::background::{self};
|
||||||
use garage_util::config::Config;
|
use garage_util::config::Config;
|
||||||
#[cfg(feature = "kubernetes-discovery")]
|
#[cfg(feature = "kubernetes-discovery")]
|
||||||
use garage_util::config::KubernetesDiscoveryConfig;
|
use garage_util::config::KubernetesDiscoveryConfig;
|
||||||
|
@ -110,9 +110,6 @@ pub struct System {
|
||||||
pub ring: watch::Receiver<Arc<Ring>>,
|
pub ring: watch::Receiver<Arc<Ring>>,
|
||||||
update_ring: Mutex<watch::Sender<Arc<Ring>>>,
|
update_ring: Mutex<watch::Sender<Arc<Ring>>>,
|
||||||
|
|
||||||
/// The job runner of this node
|
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
|
|
||||||
/// Path to metadata directory
|
/// Path to metadata directory
|
||||||
pub metadata_dir: PathBuf,
|
pub metadata_dir: PathBuf,
|
||||||
}
|
}
|
||||||
|
@ -232,7 +229,6 @@ impl System {
|
||||||
/// Create this node's membership manager
|
/// Create this node's membership manager
|
||||||
pub fn new(
|
pub fn new(
|
||||||
network_key: NetworkKey,
|
network_key: NetworkKey,
|
||||||
background: Arc<BackgroundRunner>,
|
|
||||||
replication_mode: ReplicationMode,
|
replication_mode: ReplicationMode,
|
||||||
config: &Config,
|
config: &Config,
|
||||||
) -> Result<Arc<Self>, Error> {
|
) -> Result<Arc<Self>, Error> {
|
||||||
|
@ -354,7 +350,6 @@ impl System {
|
||||||
rpc: RpcHelper::new(
|
rpc: RpcHelper::new(
|
||||||
netapp.id.into(),
|
netapp.id.into(),
|
||||||
fullmesh,
|
fullmesh,
|
||||||
background.clone(),
|
|
||||||
ring.clone(),
|
ring.clone(),
|
||||||
config.rpc_timeout_msec.map(Duration::from_millis),
|
config.rpc_timeout_msec.map(Duration::from_millis),
|
||||||
),
|
),
|
||||||
|
@ -372,7 +367,6 @@ impl System {
|
||||||
|
|
||||||
ring,
|
ring,
|
||||||
update_ring: Mutex::new(update_ring),
|
update_ring: Mutex::new(update_ring),
|
||||||
background,
|
|
||||||
metadata_dir: config.metadata_dir.clone(),
|
metadata_dir: config.metadata_dir.clone(),
|
||||||
});
|
});
|
||||||
sys.system_endpoint.set_handler(sys.clone());
|
sys.system_endpoint.set_handler(sys.clone());
|
||||||
|
@ -578,7 +572,7 @@ impl System {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Save network configuration to disk
|
/// Save network configuration to disk
|
||||||
async fn save_cluster_layout(self: Arc<Self>) -> Result<(), Error> {
|
async fn save_cluster_layout(&self) -> Result<(), Error> {
|
||||||
let ring: Arc<Ring> = self.ring.borrow().clone();
|
let ring: Arc<Ring> = self.ring.borrow().clone();
|
||||||
self.persist_cluster_layout
|
self.persist_cluster_layout
|
||||||
.save_async(&ring.layout)
|
.save_async(&ring.layout)
|
||||||
|
@ -631,7 +625,7 @@ impl System {
|
||||||
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
|
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
|
||||||
{
|
{
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
self.background.spawn_cancellable(async move {
|
background::spawn(async move {
|
||||||
self2.pull_cluster_layout(from).await;
|
self2.pull_cluster_layout(from).await;
|
||||||
Ok(())
|
Ok(())
|
||||||
});
|
});
|
||||||
|
@ -676,7 +670,7 @@ impl System {
|
||||||
drop(update_ring);
|
drop(update_ring);
|
||||||
|
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
self.background.spawn_cancellable(async move {
|
background::spawn(async move {
|
||||||
self2
|
self2
|
||||||
.rpc
|
.rpc
|
||||||
.broadcast(
|
.broadcast(
|
||||||
|
@ -687,7 +681,8 @@ impl System {
|
||||||
.await?;
|
.await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
});
|
});
|
||||||
self.background.spawn(self.clone().save_cluster_layout());
|
|
||||||
|
self.save_cluster_layout().await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(SystemRpc::Ok)
|
Ok(SystemRpc::Ok)
|
||||||
|
@ -773,7 +768,7 @@ impl System {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (node_id, node_addr) in ping_list {
|
for (node_id, node_addr) in ping_list {
|
||||||
tokio::spawn(
|
background::spawn(
|
||||||
self.netapp
|
self.netapp
|
||||||
.clone()
|
.clone()
|
||||||
.try_connect(node_addr, node_id)
|
.try_connect(node_addr, node_id)
|
||||||
|
@ -787,11 +782,10 @@ impl System {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "consul-discovery")]
|
#[cfg(feature = "consul-discovery")]
|
||||||
self.background.spawn(self.clone().advertise_to_consul());
|
background::spawn(self.clone().advertise_to_consul());
|
||||||
|
|
||||||
#[cfg(feature = "kubernetes-discovery")]
|
#[cfg(feature = "kubernetes-discovery")]
|
||||||
self.background
|
background::spawn(self.clone().advertise_to_kubernetes());
|
||||||
.spawn(self.clone().advertise_to_kubernetes());
|
|
||||||
|
|
||||||
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
|
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
|
||||||
select! {
|
select! {
|
||||||
|
|
|
@ -69,10 +69,8 @@ where
|
||||||
gc
|
gc
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn spawn_workers(self: &Arc<Self>) {
|
pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
self.system
|
bg.spawn_worker(GcWorker::new(self.clone()));
|
||||||
.background
|
|
||||||
.spawn_worker(GcWorker::new(self.clone()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
|
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
|
||||||
|
|
|
@ -87,12 +87,12 @@ where
|
||||||
syncer
|
syncer
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn spawn_workers(self: &Arc<Self>) {
|
pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
|
let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
|
||||||
self.add_full_sync_tx
|
self.add_full_sync_tx
|
||||||
.store(Some(Arc::new(add_full_sync_tx)));
|
.store(Some(Arc::new(add_full_sync_tx)));
|
||||||
|
|
||||||
self.system.background.spawn_worker(SyncWorker {
|
bg.spawn_worker(SyncWorker {
|
||||||
syncer: self.clone(),
|
syncer: self.clone(),
|
||||||
ring_recv: self.system.ring.clone(),
|
ring_recv: self.system.ring.clone(),
|
||||||
ring: self.system.ring.borrow().clone(),
|
ring: self.system.ring.borrow().clone(),
|
||||||
|
|
|
@ -14,6 +14,7 @@ use opentelemetry::{
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
|
use garage_util::background::{self, BackgroundRunner};
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error;
|
use garage_util::error::Error;
|
||||||
use garage_util::metrics::RecordDuration;
|
use garage_util::metrics::RecordDuration;
|
||||||
|
@ -96,13 +97,11 @@ where
|
||||||
table
|
table
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn spawn_workers(self: &Arc<Self>) {
|
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
self.merkle_updater.spawn_workers(&self.system.background);
|
self.merkle_updater.spawn_workers(bg);
|
||||||
self.syncer.spawn_workers();
|
self.syncer.spawn_workers(bg);
|
||||||
self.gc.spawn_workers();
|
self.gc.spawn_workers(bg);
|
||||||
self.system
|
bg.spawn_worker(InsertQueueWorker(self.clone()));
|
||||||
.background
|
|
||||||
.spawn_worker(InsertQueueWorker(self.clone()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
|
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
|
||||||
|
@ -276,9 +275,7 @@ where
|
||||||
if not_all_same {
|
if not_all_same {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
let ent2 = ret_entry.clone();
|
let ent2 = ret_entry.clone();
|
||||||
self.system
|
background::spawn(async move { self2.repair_on_read(&who[..], ent2).await });
|
||||||
.background
|
|
||||||
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -375,7 +372,7 @@ where
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|k| ret.get(&k).unwrap().clone())
|
.map(|k| ret.get(&k).unwrap().clone())
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
self.system.background.spawn_cancellable(async move {
|
background::spawn(async move {
|
||||||
for v in to_repair {
|
for v in to_repair {
|
||||||
self2.repair_on_read(&who[..], v).await?;
|
self2.repair_on_read(&who[..], v).await?;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,48 +0,0 @@
|
||||||
//! Job worker: a generic worker that just processes incoming
|
|
||||||
//! jobs one by one
|
|
||||||
|
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use async_trait::async_trait;
|
|
||||||
use tokio::sync::{mpsc, Mutex};
|
|
||||||
|
|
||||||
use crate::background::worker::*;
|
|
||||||
use crate::background::*;
|
|
||||||
|
|
||||||
pub(crate) struct JobWorker {
|
|
||||||
pub(crate) index: usize,
|
|
||||||
pub(crate) job_chan: Arc<Mutex<mpsc::UnboundedReceiver<(Job, bool)>>>,
|
|
||||||
pub(crate) next_job: Option<Job>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl Worker for JobWorker {
|
|
||||||
fn name(&self) -> String {
|
|
||||||
format!("Job worker #{}", self.index)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
|
||||||
match self.next_job.take() {
|
|
||||||
None => return Ok(WorkerState::Idle),
|
|
||||||
Some(job) => {
|
|
||||||
job.await?;
|
|
||||||
Ok(WorkerState::Busy)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
|
||||||
loop {
|
|
||||||
match self.job_chan.lock().await.recv().await {
|
|
||||||
Some((job, cancellable)) => {
|
|
||||||
if cancellable && *must_exit.borrow() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
self.next_job = Some(job);
|
|
||||||
return WorkerState::Busy;
|
|
||||||
}
|
|
||||||
None => return WorkerState::Done,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,27 +1,23 @@
|
||||||
//! Job runner for futures and async functions
|
//! Job runner for futures and async functions
|
||||||
|
|
||||||
pub mod job_worker;
|
|
||||||
pub mod worker;
|
pub mod worker;
|
||||||
|
|
||||||
use core::future::Future;
|
use core::future::Future;
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::pin::Pin;
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::sync::{mpsc, watch, Mutex};
|
use tokio::sync::{mpsc, watch};
|
||||||
|
|
||||||
use crate::error::Error;
|
use crate::error::Error;
|
||||||
use worker::WorkerProcessor;
|
use worker::WorkerProcessor;
|
||||||
pub use worker::{Worker, WorkerState};
|
pub use worker::{Worker, WorkerState};
|
||||||
|
|
||||||
pub(crate) type JobOutput = Result<(), Error>;
|
pub(crate) type JobOutput = Result<(), Error>;
|
||||||
pub(crate) type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
|
|
||||||
|
|
||||||
/// Job runner for futures and async functions
|
/// Job runner for futures and async functions
|
||||||
pub struct BackgroundRunner {
|
pub struct BackgroundRunner {
|
||||||
send_job: mpsc::UnboundedSender<(Job, bool)>,
|
|
||||||
send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
|
send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
|
||||||
worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
|
worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
|
||||||
}
|
}
|
||||||
|
@ -49,10 +45,7 @@ pub struct WorkerStatus {
|
||||||
|
|
||||||
impl BackgroundRunner {
|
impl BackgroundRunner {
|
||||||
/// Create a new BackgroundRunner
|
/// Create a new BackgroundRunner
|
||||||
pub fn new(
|
pub fn new(stop_signal: watch::Receiver<bool>) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
|
||||||
n_runners: usize,
|
|
||||||
stop_signal: watch::Receiver<bool>,
|
|
||||||
) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
|
|
||||||
let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
|
let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
|
||||||
|
|
||||||
let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
|
let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
|
||||||
|
@ -63,24 +56,7 @@ impl BackgroundRunner {
|
||||||
worker_processor.run().await;
|
worker_processor.run().await;
|
||||||
});
|
});
|
||||||
|
|
||||||
let (send_job, queue_out) = mpsc::unbounded_channel();
|
|
||||||
let queue_out = Arc::new(Mutex::new(queue_out));
|
|
||||||
|
|
||||||
for i in 0..n_runners {
|
|
||||||
let queue_out = queue_out.clone();
|
|
||||||
|
|
||||||
send_worker
|
|
||||||
.send(Box::new(job_worker::JobWorker {
|
|
||||||
index: i,
|
|
||||||
job_chan: queue_out.clone(),
|
|
||||||
next_job: None,
|
|
||||||
}))
|
|
||||||
.ok()
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
let bgrunner = Arc::new(Self {
|
let bgrunner = Arc::new(Self {
|
||||||
send_job,
|
|
||||||
send_worker,
|
send_worker,
|
||||||
worker_info,
|
worker_info,
|
||||||
});
|
});
|
||||||
|
@ -91,31 +67,6 @@ impl BackgroundRunner {
|
||||||
self.worker_info.lock().unwrap().clone()
|
self.worker_info.lock().unwrap().clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Spawn a task to be run in background
|
|
||||||
pub fn spawn<T>(&self, job: T)
|
|
||||||
where
|
|
||||||
T: Future<Output = JobOutput> + Send + 'static,
|
|
||||||
{
|
|
||||||
let boxed: Job = Box::pin(job);
|
|
||||||
self.send_job
|
|
||||||
.send((boxed, false))
|
|
||||||
.ok()
|
|
||||||
.expect("Could not put job in queue");
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Spawn a task to be run in background. It may get discarded before running if spawned while
|
|
||||||
/// the runner is stopping
|
|
||||||
pub fn spawn_cancellable<T>(&self, job: T)
|
|
||||||
where
|
|
||||||
T: Future<Output = JobOutput> + Send + 'static,
|
|
||||||
{
|
|
||||||
let boxed: Job = Box::pin(job);
|
|
||||||
self.send_job
|
|
||||||
.send((boxed, true))
|
|
||||||
.ok()
|
|
||||||
.expect("Could not put job in queue");
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn spawn_worker<W>(&self, worker: W)
|
pub fn spawn_worker<W>(&self, worker: W)
|
||||||
where
|
where
|
||||||
W: Worker + 'static,
|
W: Worker + 'static,
|
||||||
|
@ -126,3 +77,14 @@ impl BackgroundRunner {
|
||||||
.expect("Could not put worker in queue");
|
.expect("Could not put worker in queue");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn spawn<T>(job: T)
|
||||||
|
where
|
||||||
|
T: Future<Output = JobOutput> + Send + 'static,
|
||||||
|
{
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Err(e) = job.await {
|
||||||
|
error!("{}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue