forked from Deuxfleurs/garage
Merge pull request 'Some improvements to Garage internals' (#451) from internals-rework into main
Reviewed-on: Deuxfleurs/garage#451
This commit is contained in:
commit
582b076179
27 changed files with 442 additions and 559 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1243,6 +1243,7 @@ dependencies = [
|
||||||
name = "garage_table"
|
name = "garage_table"
|
||||||
version = "0.8.1"
|
version = "0.8.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"arc-swap",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures",
|
"futures",
|
||||||
|
|
|
@ -32,7 +32,7 @@ args@{
|
||||||
ignoreLockHash,
|
ignoreLockHash,
|
||||||
}:
|
}:
|
||||||
let
|
let
|
||||||
nixifiedLockHash = "463114c4544bfa9b442a43afc6b39eb588f5720825c7a246ba9188c4bdb52944";
|
nixifiedLockHash = "4639f63ff4c54c01f66ec3d0d362f6905456dd768d6e94df1a7367c763721fd7";
|
||||||
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
|
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
|
||||||
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
|
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
|
||||||
lockHashIgnored = if ignoreLockHash
|
lockHashIgnored = if ignoreLockHash
|
||||||
|
@ -1769,6 +1769,7 @@ in
|
||||||
registry = "unknown";
|
registry = "unknown";
|
||||||
src = fetchCrateLocal (workspaceSrc + "/src/table");
|
src = fetchCrateLocal (workspaceSrc + "/src/table");
|
||||||
dependencies = {
|
dependencies = {
|
||||||
|
arc_swap = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".arc-swap."1.5.0" { inherit profileName; }).out;
|
||||||
async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.52" { profileName = "__noProfile"; }).out;
|
async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.52" { profileName = "__noProfile"; }).out;
|
||||||
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.2.0" { inherit profileName; }).out;
|
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.2.0" { inherit profileName; }).out;
|
||||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.21" { inherit profileName; }).out;
|
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.21" { inherit profileName; }).out;
|
||||||
|
|
|
@ -3,6 +3,7 @@ use std::pin::Pin;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use arc_swap::ArcSwapOption;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
@ -22,6 +23,7 @@ use garage_rpc::rpc_helper::netapp::stream::{stream_asyncread, ByteStream};
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
|
use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::*;
|
use garage_util::error::*;
|
||||||
use garage_util::metrics::RecordDuration;
|
use garage_util::metrics::RecordDuration;
|
||||||
|
@ -87,7 +89,7 @@ pub struct BlockManager {
|
||||||
|
|
||||||
pub(crate) metrics: BlockManagerMetrics,
|
pub(crate) metrics: BlockManagerMetrics,
|
||||||
|
|
||||||
tx_scrub_command: mpsc::Sender<ScrubWorkerCommand>,
|
tx_scrub_command: ArcSwapOption<mpsc::Sender<ScrubWorkerCommand>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||||
|
@ -126,8 +128,6 @@ impl BlockManager {
|
||||||
let metrics =
|
let metrics =
|
||||||
BlockManagerMetrics::new(rc.rc.clone(), resync.queue.clone(), resync.errors.clone());
|
BlockManagerMetrics::new(rc.rc.clone(), resync.queue.clone(), resync.errors.clone());
|
||||||
|
|
||||||
let (scrub_tx, scrub_rx) = mpsc::channel(1);
|
|
||||||
|
|
||||||
let block_manager = Arc::new(Self {
|
let block_manager = Arc::new(Self {
|
||||||
replication,
|
replication,
|
||||||
data_dir,
|
data_dir,
|
||||||
|
@ -138,21 +138,24 @@ impl BlockManager {
|
||||||
system,
|
system,
|
||||||
endpoint,
|
endpoint,
|
||||||
metrics,
|
metrics,
|
||||||
tx_scrub_command: scrub_tx,
|
tx_scrub_command: ArcSwapOption::new(None),
|
||||||
});
|
});
|
||||||
block_manager.endpoint.set_handler(block_manager.clone());
|
block_manager.endpoint.set_handler(block_manager.clone());
|
||||||
|
|
||||||
|
block_manager
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
// Spawn a bunch of resync workers
|
// Spawn a bunch of resync workers
|
||||||
for index in 0..MAX_RESYNC_WORKERS {
|
for index in 0..MAX_RESYNC_WORKERS {
|
||||||
let worker = ResyncWorker::new(index, block_manager.clone());
|
let worker = ResyncWorker::new(index, self.clone());
|
||||||
block_manager.system.background.spawn_worker(worker);
|
bg.spawn_worker(worker);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Spawn scrub worker
|
// Spawn scrub worker
|
||||||
let scrub_worker = ScrubWorker::new(block_manager.clone(), scrub_rx);
|
let (scrub_tx, scrub_rx) = mpsc::channel(1);
|
||||||
block_manager.system.background.spawn_worker(scrub_worker);
|
self.tx_scrub_command.store(Some(Arc::new(scrub_tx)));
|
||||||
|
bg.spawn_worker(ScrubWorker::new(self.clone(), scrub_rx));
|
||||||
block_manager
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Ask nodes that might have a (possibly compressed) block for it
|
/// Ask nodes that might have a (possibly compressed) block for it
|
||||||
|
@ -325,8 +328,11 @@ impl BlockManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Send command to start/stop/manage scrub worker
|
/// Send command to start/stop/manage scrub worker
|
||||||
pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) {
|
pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) -> Result<(), Error> {
|
||||||
let _ = self.tx_scrub_command.send(cmd).await;
|
let tx = self.tx_scrub_command.load();
|
||||||
|
let tx = tx.as_ref().ok_or_message("scrub worker is not running")?;
|
||||||
|
tx.send(cmd).await.ok_or_message("send error")?;
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the reference count of a block
|
/// Get the reference count of a block
|
||||||
|
|
|
@ -148,7 +148,7 @@ impl Worker for RepairWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -341,7 +341,7 @@ impl Worker for ScrubWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
let (wait_until, command) = match &self.work {
|
let (wait_until, command) = match &self.work {
|
||||||
ScrubWorkerState::Running(_) => return WorkerState::Busy,
|
ScrubWorkerState::Running(_) => return WorkerState::Busy,
|
||||||
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
|
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
|
||||||
|
|
|
@ -540,7 +540,7 @@ impl Worker for ResyncWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
while self.index >= self.manager.resync.persisted.load().n_workers {
|
while self.index >= self.manager.resync.persisted.load().n_workers {
|
||||||
self.manager.resync.notify.notified().await
|
self.manager.resync.notify.notified().await
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ use std::sync::Arc;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::crdt::*;
|
use garage_util::crdt::*;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error as GarageError;
|
use garage_util::error::Error as GarageError;
|
||||||
|
@ -74,13 +75,18 @@ impl Rpc for AdminRpc {
|
||||||
|
|
||||||
pub struct AdminRpcHandler {
|
pub struct AdminRpcHandler {
|
||||||
garage: Arc<Garage>,
|
garage: Arc<Garage>,
|
||||||
|
background: Arc<BackgroundRunner>,
|
||||||
endpoint: Arc<Endpoint<AdminRpc, Self>>,
|
endpoint: Arc<Endpoint<AdminRpc, Self>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AdminRpcHandler {
|
impl AdminRpcHandler {
|
||||||
pub fn new(garage: Arc<Garage>) -> Arc<Self> {
|
pub fn new(garage: Arc<Garage>, background: Arc<BackgroundRunner>) -> Arc<Self> {
|
||||||
let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into());
|
let endpoint = garage.system.netapp.endpoint(ADMIN_RPC_PATH.into());
|
||||||
let admin = Arc::new(Self { garage, endpoint });
|
let admin = Arc::new(Self {
|
||||||
|
garage,
|
||||||
|
background,
|
||||||
|
endpoint,
|
||||||
|
});
|
||||||
admin.endpoint.set_handler(admin.clone());
|
admin.endpoint.set_handler(admin.clone());
|
||||||
admin
|
admin
|
||||||
}
|
}
|
||||||
|
@ -759,7 +765,7 @@ impl AdminRpcHandler {
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
launch_online_repair(self.garage.clone(), opt).await;
|
launch_online_repair(&self.garage, &self.background, opt).await?;
|
||||||
Ok(AdminRpc::Ok(format!(
|
Ok(AdminRpc::Ok(format!(
|
||||||
"Repair launched on {:?}",
|
"Repair launched on {:?}",
|
||||||
self.garage.system.id
|
self.garage.system.id
|
||||||
|
@ -925,12 +931,11 @@ impl AdminRpcHandler {
|
||||||
async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> {
|
async fn handle_worker_cmd(&self, cmd: &WorkerOperation) -> Result<AdminRpc, Error> {
|
||||||
match cmd {
|
match cmd {
|
||||||
WorkerOperation::List { opt } => {
|
WorkerOperation::List { opt } => {
|
||||||
let workers = self.garage.background.get_worker_info();
|
let workers = self.background.get_worker_info();
|
||||||
Ok(AdminRpc::WorkerList(workers, *opt))
|
Ok(AdminRpc::WorkerList(workers, *opt))
|
||||||
}
|
}
|
||||||
WorkerOperation::Info { tid } => {
|
WorkerOperation::Info { tid } => {
|
||||||
let info = self
|
let info = self
|
||||||
.garage
|
|
||||||
.background
|
.background
|
||||||
.get_worker_info()
|
.get_worker_info()
|
||||||
.get(tid)
|
.get(tid)
|
||||||
|
@ -944,7 +949,7 @@ impl AdminRpcHandler {
|
||||||
self.garage
|
self.garage
|
||||||
.block_manager
|
.block_manager
|
||||||
.send_scrub_command(scrub_command)
|
.send_scrub_command(scrub_command)
|
||||||
.await;
|
.await?;
|
||||||
Ok(AdminRpc::Ok("Scrub tranquility updated".into()))
|
Ok(AdminRpc::Ok("Scrub tranquility updated".into()))
|
||||||
}
|
}
|
||||||
WorkerSetCmd::ResyncWorkerCount { worker_count } => {
|
WorkerSetCmd::ResyncWorkerCount { worker_count } => {
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use tokio::sync::watch;
|
|
||||||
|
|
||||||
use garage_util::background::*;
|
|
||||||
use garage_util::config::*;
|
use garage_util::config::*;
|
||||||
use garage_util::error::*;
|
use garage_util::error::*;
|
||||||
|
|
||||||
|
@ -20,12 +17,8 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
|
||||||
info!("Loading configuration...");
|
info!("Loading configuration...");
|
||||||
let config = read_config(config_file)?;
|
let config = read_config(config_file)?;
|
||||||
|
|
||||||
info!("Initializing background runner...");
|
|
||||||
let (done_tx, done_rx) = watch::channel(false);
|
|
||||||
let (background, await_background_done) = BackgroundRunner::new(16, done_rx);
|
|
||||||
|
|
||||||
info!("Initializing Garage main data store...");
|
info!("Initializing Garage main data store...");
|
||||||
let garage = Garage::new(config.clone(), background)?;
|
let garage = Garage::new(config)?;
|
||||||
|
|
||||||
info!("Launching repair operation...");
|
info!("Launching repair operation...");
|
||||||
match opt.what {
|
match opt.what {
|
||||||
|
@ -43,13 +36,7 @@ pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Resu
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Repair operation finished, shutting down Garage internals...");
|
info!("Repair operation finished, shutting down...");
|
||||||
done_tx.send(true).unwrap();
|
|
||||||
drop(garage);
|
|
||||||
|
|
||||||
await_background_done.await?;
|
|
||||||
|
|
||||||
info!("Cleaning up...");
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,35 +15,33 @@ use garage_util::error::Error;
|
||||||
|
|
||||||
use crate::*;
|
use crate::*;
|
||||||
|
|
||||||
pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) {
|
pub async fn launch_online_repair(
|
||||||
|
garage: &Arc<Garage>,
|
||||||
|
bg: &BackgroundRunner,
|
||||||
|
opt: RepairOpt,
|
||||||
|
) -> Result<(), Error> {
|
||||||
match opt.what {
|
match opt.what {
|
||||||
RepairWhat::Tables => {
|
RepairWhat::Tables => {
|
||||||
info!("Launching a full sync of tables");
|
info!("Launching a full sync of tables");
|
||||||
garage.bucket_table.syncer.add_full_sync();
|
garage.bucket_table.syncer.add_full_sync()?;
|
||||||
garage.object_table.syncer.add_full_sync();
|
garage.object_table.syncer.add_full_sync()?;
|
||||||
garage.version_table.syncer.add_full_sync();
|
garage.version_table.syncer.add_full_sync()?;
|
||||||
garage.block_ref_table.syncer.add_full_sync();
|
garage.block_ref_table.syncer.add_full_sync()?;
|
||||||
garage.key_table.syncer.add_full_sync();
|
garage.key_table.syncer.add_full_sync()?;
|
||||||
}
|
}
|
||||||
RepairWhat::Versions => {
|
RepairWhat::Versions => {
|
||||||
info!("Repairing the versions table");
|
info!("Repairing the versions table");
|
||||||
garage
|
bg.spawn_worker(RepairVersionsWorker::new(garage.clone()));
|
||||||
.background
|
|
||||||
.spawn_worker(RepairVersionsWorker::new(garage.clone()));
|
|
||||||
}
|
}
|
||||||
RepairWhat::BlockRefs => {
|
RepairWhat::BlockRefs => {
|
||||||
info!("Repairing the block refs table");
|
info!("Repairing the block refs table");
|
||||||
garage
|
bg.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
|
||||||
.background
|
|
||||||
.spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
|
|
||||||
}
|
}
|
||||||
RepairWhat::Blocks => {
|
RepairWhat::Blocks => {
|
||||||
info!("Repairing the stored blocks");
|
info!("Repairing the stored blocks");
|
||||||
garage
|
bg.spawn_worker(garage_block::repair::RepairWorker::new(
|
||||||
.background
|
garage.block_manager.clone(),
|
||||||
.spawn_worker(garage_block::repair::RepairWorker::new(
|
));
|
||||||
garage.block_manager.clone(),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
RepairWhat::Scrub { cmd } => {
|
RepairWhat::Scrub { cmd } => {
|
||||||
let cmd = match cmd {
|
let cmd = match cmd {
|
||||||
|
@ -56,9 +54,10 @@ pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
info!("Sending command to scrub worker: {:?}", cmd);
|
info!("Sending command to scrub worker: {:?}", cmd);
|
||||||
garage.block_manager.send_scrub_command(cmd).await;
|
garage.block_manager.send_scrub_command(cmd).await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// ----
|
// ----
|
||||||
|
@ -93,19 +92,14 @@ impl Worker for RepairVersionsWorker {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
||||||
let item_bytes = match self.garage.version_table.data.store.get_gt(&self.pos)? {
|
let (item_bytes, next_pos) = match self.garage.version_table.data.store.get_gt(&self.pos)? {
|
||||||
Some((k, v)) => {
|
Some((k, v)) => (v, k),
|
||||||
self.pos = k;
|
|
||||||
v
|
|
||||||
}
|
|
||||||
None => {
|
None => {
|
||||||
info!("repair_versions: finished, done {}", self.counter);
|
info!("repair_versions: finished, done {}", self.counter);
|
||||||
return Ok(WorkerState::Done);
|
return Ok(WorkerState::Done);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
self.counter += 1;
|
|
||||||
|
|
||||||
let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?;
|
let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?;
|
||||||
if !version.deleted.get() {
|
if !version.deleted.get() {
|
||||||
let object = self
|
let object = self
|
||||||
|
@ -134,10 +128,13 @@ impl Worker for RepairVersionsWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.counter += 1;
|
||||||
|
self.pos = next_pos;
|
||||||
|
|
||||||
Ok(WorkerState::Busy)
|
Ok(WorkerState::Busy)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -174,18 +171,14 @@ impl Worker for RepairBlockrefsWorker {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
||||||
let item_bytes = match self.garage.block_ref_table.data.store.get_gt(&self.pos)? {
|
let (item_bytes, next_pos) =
|
||||||
Some((k, v)) => {
|
match self.garage.block_ref_table.data.store.get_gt(&self.pos)? {
|
||||||
self.pos = k;
|
Some((k, v)) => (v, k),
|
||||||
v
|
None => {
|
||||||
}
|
info!("repair_block_ref: finished, done {}", self.counter);
|
||||||
None => {
|
return Ok(WorkerState::Done);
|
||||||
info!("repair_block_ref: finished, done {}", self.counter);
|
}
|
||||||
return Ok(WorkerState::Done);
|
};
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
self.counter += 1;
|
|
||||||
|
|
||||||
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?;
|
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?;
|
||||||
if !block_ref.deleted.get() {
|
if !block_ref.deleted.get() {
|
||||||
|
@ -212,10 +205,13 @@ impl Worker for RepairBlockrefsWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.counter += 1;
|
||||||
|
self.pos = next_pos;
|
||||||
|
|
||||||
Ok(WorkerState::Busy)
|
Ok(WorkerState::Busy)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,12 +35,15 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
|
||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
let metrics_exporter = opentelemetry_prometheus::exporter().init();
|
let metrics_exporter = opentelemetry_prometheus::exporter().init();
|
||||||
|
|
||||||
|
info!("Initializing Garage main data store...");
|
||||||
|
let garage = Garage::new(config.clone())?;
|
||||||
|
|
||||||
info!("Initializing background runner...");
|
info!("Initializing background runner...");
|
||||||
let watch_cancel = watch_shutdown_signal();
|
let watch_cancel = watch_shutdown_signal();
|
||||||
let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone());
|
let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone());
|
||||||
|
|
||||||
info!("Initializing Garage main data store...");
|
info!("Spawning Garage workers...");
|
||||||
let garage = Garage::new(config.clone(), background)?;
|
garage.spawn_workers(&background);
|
||||||
|
|
||||||
if config.admin.trace_sink.is_some() {
|
if config.admin.trace_sink.is_some() {
|
||||||
info!("Initialize tracing...");
|
info!("Initialize tracing...");
|
||||||
|
@ -63,7 +66,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
|
||||||
let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
|
let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
|
||||||
|
|
||||||
info!("Create admin RPC handler...");
|
info!("Create admin RPC handler...");
|
||||||
AdminRpcHandler::new(garage.clone());
|
AdminRpcHandler::new(garage.clone(), background.clone());
|
||||||
|
|
||||||
// ---- Launch public-facing API servers ----
|
// ---- Launch public-facing API servers ----
|
||||||
|
|
||||||
|
|
|
@ -39,8 +39,6 @@ pub struct Garage {
|
||||||
|
|
||||||
/// The local database
|
/// The local database
|
||||||
pub db: db::Db,
|
pub db: db::Db,
|
||||||
/// A background job runner
|
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
/// The membership manager
|
/// The membership manager
|
||||||
pub system: Arc<System>,
|
pub system: Arc<System>,
|
||||||
/// The block manager
|
/// The block manager
|
||||||
|
@ -78,7 +76,7 @@ pub struct GarageK2V {
|
||||||
|
|
||||||
impl Garage {
|
impl Garage {
|
||||||
/// Create and run garage
|
/// Create and run garage
|
||||||
pub fn new(config: Config, background: Arc<BackgroundRunner>) -> Result<Arc<Self>, Error> {
|
pub fn new(config: Config) -> Result<Arc<Self>, Error> {
|
||||||
// Create meta dir and data dir if they don't exist already
|
// Create meta dir and data dir if they don't exist already
|
||||||
std::fs::create_dir_all(&config.metadata_dir)
|
std::fs::create_dir_all(&config.metadata_dir)
|
||||||
.ok_or_message("Unable to create Garage metadata directory")?;
|
.ok_or_message("Unable to create Garage metadata directory")?;
|
||||||
|
@ -167,7 +165,7 @@ impl Garage {
|
||||||
.expect("Invalid replication_mode in config file.");
|
.expect("Invalid replication_mode in config file.");
|
||||||
|
|
||||||
info!("Initialize membership management system...");
|
info!("Initialize membership management system...");
|
||||||
let system = System::new(network_key, background.clone(), replication_mode, &config)?;
|
let system = System::new(network_key, replication_mode, &config)?;
|
||||||
|
|
||||||
let data_rep_param = TableShardedReplication {
|
let data_rep_param = TableShardedReplication {
|
||||||
system: system.clone(),
|
system: system.clone(),
|
||||||
|
@ -225,7 +223,6 @@ impl Garage {
|
||||||
info!("Initialize version_table...");
|
info!("Initialize version_table...");
|
||||||
let version_table = Table::new(
|
let version_table = Table::new(
|
||||||
VersionTable {
|
VersionTable {
|
||||||
background: background.clone(),
|
|
||||||
block_ref_table: block_ref_table.clone(),
|
block_ref_table: block_ref_table.clone(),
|
||||||
},
|
},
|
||||||
meta_rep_param.clone(),
|
meta_rep_param.clone(),
|
||||||
|
@ -240,7 +237,6 @@ impl Garage {
|
||||||
#[allow(clippy::redundant_clone)]
|
#[allow(clippy::redundant_clone)]
|
||||||
let object_table = Table::new(
|
let object_table = Table::new(
|
||||||
ObjectTable {
|
ObjectTable {
|
||||||
background: background.clone(),
|
|
||||||
version_table: version_table.clone(),
|
version_table: version_table.clone(),
|
||||||
object_counter_table: object_counter_table.clone(),
|
object_counter_table: object_counter_table.clone(),
|
||||||
},
|
},
|
||||||
|
@ -258,7 +254,6 @@ impl Garage {
|
||||||
config,
|
config,
|
||||||
replication_mode,
|
replication_mode,
|
||||||
db,
|
db,
|
||||||
background,
|
|
||||||
system,
|
system,
|
||||||
block_manager,
|
block_manager,
|
||||||
bucket_table,
|
bucket_table,
|
||||||
|
@ -273,6 +268,22 @@ impl Garage {
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
|
||||||
|
self.block_manager.spawn_workers(bg);
|
||||||
|
|
||||||
|
self.bucket_table.spawn_workers(bg);
|
||||||
|
self.bucket_alias_table.spawn_workers(bg);
|
||||||
|
self.key_table.spawn_workers(bg);
|
||||||
|
|
||||||
|
self.object_table.spawn_workers(bg);
|
||||||
|
self.object_counter_table.spawn_workers(bg);
|
||||||
|
self.version_table.spawn_workers(bg);
|
||||||
|
self.block_ref_table.spawn_workers(bg);
|
||||||
|
|
||||||
|
#[cfg(feature = "k2v")]
|
||||||
|
self.k2v.spawn_workers(bg);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
|
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
|
||||||
helper::bucket::BucketHelper(self)
|
helper::bucket::BucketHelper(self)
|
||||||
}
|
}
|
||||||
|
@ -307,4 +318,9 @@ impl GarageK2V {
|
||||||
rpc,
|
rpc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
|
||||||
|
self.item_table.spawn_workers(bg);
|
||||||
|
self.counter_table.spawn_workers(bg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,17 +1,15 @@
|
||||||
use core::ops::Bound;
|
use core::ops::Bound;
|
||||||
use std::collections::{hash_map, BTreeMap, HashMap};
|
use std::collections::{BTreeMap, HashMap};
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::sync::{mpsc, watch};
|
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
use garage_rpc::ring::Ring;
|
use garage_rpc::ring::Ring;
|
||||||
use garage_rpc::system::System;
|
use garage_rpc::system::System;
|
||||||
use garage_util::background::*;
|
use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::*;
|
use garage_util::error::*;
|
||||||
use garage_util::time::*;
|
use garage_util::time::*;
|
||||||
|
@ -142,7 +140,6 @@ impl<T: CountedItem> TableSchema for CounterTable<T> {
|
||||||
pub struct IndexCounter<T: CountedItem> {
|
pub struct IndexCounter<T: CountedItem> {
|
||||||
this_node: Uuid,
|
this_node: Uuid,
|
||||||
local_counter: db::Tree,
|
local_counter: db::Tree,
|
||||||
propagate_tx: mpsc::UnboundedSender<(T::CP, T::CS, LocalCounterEntry<T>)>,
|
|
||||||
pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>,
|
pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -152,16 +149,11 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
replication: TableShardedReplication,
|
replication: TableShardedReplication,
|
||||||
db: &db::Db,
|
db: &db::Db,
|
||||||
) -> Arc<Self> {
|
) -> Arc<Self> {
|
||||||
let background = system.background.clone();
|
Arc::new(Self {
|
||||||
|
|
||||||
let (propagate_tx, propagate_rx) = mpsc::unbounded_channel();
|
|
||||||
|
|
||||||
let this = Arc::new(Self {
|
|
||||||
this_node: system.id,
|
this_node: system.id,
|
||||||
local_counter: db
|
local_counter: db
|
||||||
.open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME))
|
.open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME))
|
||||||
.expect("Unable to open local counter tree"),
|
.expect("Unable to open local counter tree"),
|
||||||
propagate_tx,
|
|
||||||
table: Table::new(
|
table: Table::new(
|
||||||
CounterTable {
|
CounterTable {
|
||||||
_phantom_t: Default::default(),
|
_phantom_t: Default::default(),
|
||||||
|
@ -170,16 +162,11 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
system,
|
system,
|
||||||
db,
|
db,
|
||||||
),
|
),
|
||||||
});
|
})
|
||||||
|
}
|
||||||
|
|
||||||
background.spawn_worker(IndexPropagatorWorker {
|
pub fn spawn_workers(&self, bg: &BackgroundRunner) {
|
||||||
index_counter: this.clone(),
|
self.table.spawn_workers(bg);
|
||||||
propagate_rx,
|
|
||||||
buf: HashMap::new(),
|
|
||||||
errors: 0,
|
|
||||||
});
|
|
||||||
|
|
||||||
this
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count(
|
pub fn count(
|
||||||
|
@ -232,12 +219,8 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
.map_err(db::TxError::Abort)?;
|
.map_err(db::TxError::Abort)?;
|
||||||
tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?;
|
tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?;
|
||||||
|
|
||||||
if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) {
|
let dist_entry = entry.into_counter_entry(self.this_node);
|
||||||
error!(
|
self.table.queue_insert(tx, &dist_entry)?;
|
||||||
"Could not propagate updated counter values, failed to send to channel: {}",
|
|
||||||
e
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -250,23 +233,6 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
TS: TableSchema<E = T>,
|
TS: TableSchema<E = T>,
|
||||||
TR: TableReplication,
|
TR: TableReplication,
|
||||||
{
|
{
|
||||||
let save_counter_entry = |entry: CounterEntry<T>| -> Result<(), Error> {
|
|
||||||
let entry_k = self
|
|
||||||
.table
|
|
||||||
.data
|
|
||||||
.tree_key(entry.partition_key(), entry.sort_key());
|
|
||||||
self.table
|
|
||||||
.data
|
|
||||||
.update_entry_with(&entry_k, |ent| match ent {
|
|
||||||
Some(mut ent) => {
|
|
||||||
ent.merge(&entry);
|
|
||||||
ent
|
|
||||||
}
|
|
||||||
None => entry.clone(),
|
|
||||||
})?;
|
|
||||||
Ok(())
|
|
||||||
};
|
|
||||||
|
|
||||||
// 1. Set all old local counters to zero
|
// 1. Set all old local counters to zero
|
||||||
let now = now_msec();
|
let now = now_msec();
|
||||||
let mut next_start: Option<Vec<u8>> = None;
|
let mut next_start: Option<Vec<u8>> = None;
|
||||||
|
@ -302,7 +268,9 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
.insert(&local_counter_k, &local_counter_bytes)?;
|
.insert(&local_counter_k, &local_counter_bytes)?;
|
||||||
|
|
||||||
let counter_entry = local_counter.into_counter_entry(self.this_node);
|
let counter_entry = local_counter.into_counter_entry(self.this_node);
|
||||||
save_counter_entry(counter_entry)?;
|
self.local_counter
|
||||||
|
.db()
|
||||||
|
.transaction(|mut tx| self.table.queue_insert(&mut tx, &counter_entry))?;
|
||||||
|
|
||||||
next_start = Some(local_counter_k);
|
next_start = Some(local_counter_k);
|
||||||
}
|
}
|
||||||
|
@ -367,7 +335,9 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
.insert(&local_counter_key, local_counter_bytes)?;
|
.insert(&local_counter_key, local_counter_bytes)?;
|
||||||
|
|
||||||
let counter_entry = local_counter.into_counter_entry(self.this_node);
|
let counter_entry = local_counter.into_counter_entry(self.this_node);
|
||||||
save_counter_entry(counter_entry)?;
|
self.local_counter
|
||||||
|
.db()
|
||||||
|
.transaction(|mut tx| self.table.queue_insert(&mut tx, &counter_entry))?;
|
||||||
|
|
||||||
next_start = Some(counted_entry_k);
|
next_start = Some(counted_entry_k);
|
||||||
}
|
}
|
||||||
|
@ -378,96 +348,7 @@ impl<T: CountedItem> IndexCounter<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct IndexPropagatorWorker<T: CountedItem> {
|
// ----
|
||||||
index_counter: Arc<IndexCounter<T>>,
|
|
||||||
propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry<T>)>,
|
|
||||||
|
|
||||||
buf: HashMap<Vec<u8>, CounterEntry<T>>,
|
|
||||||
errors: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: CountedItem> IndexPropagatorWorker<T> {
|
|
||||||
fn add_ent(&mut self, pk: T::CP, sk: T::CS, counters: LocalCounterEntry<T>) {
|
|
||||||
let tree_key = self.index_counter.table.data.tree_key(&pk, &sk);
|
|
||||||
let dist_entry = counters.into_counter_entry(self.index_counter.this_node);
|
|
||||||
match self.buf.entry(tree_key) {
|
|
||||||
hash_map::Entry::Vacant(e) => {
|
|
||||||
e.insert(dist_entry);
|
|
||||||
}
|
|
||||||
hash_map::Entry::Occupied(mut e) => {
|
|
||||||
e.get_mut().merge(&dist_entry);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl<T: CountedItem> Worker for IndexPropagatorWorker<T> {
|
|
||||||
fn name(&self) -> String {
|
|
||||||
format!("{} counter", T::COUNTER_TABLE_NAME)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn status(&self) -> WorkerStatus {
|
|
||||||
WorkerStatus {
|
|
||||||
queue_length: Some(self.buf.len() as u64),
|
|
||||||
..Default::default()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
|
||||||
// This loop batches updates to counters to be sent all at once.
|
|
||||||
// They are sent once the propagate_rx channel has been emptied (or is closed).
|
|
||||||
let closed = loop {
|
|
||||||
match self.propagate_rx.try_recv() {
|
|
||||||
Ok((pk, sk, counters)) => {
|
|
||||||
self.add_ent(pk, sk, counters);
|
|
||||||
}
|
|
||||||
Err(mpsc::error::TryRecvError::Empty) => break false,
|
|
||||||
Err(mpsc::error::TryRecvError::Disconnected) => break true,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if !self.buf.is_empty() {
|
|
||||||
let entries_k = self.buf.keys().take(100).cloned().collect::<Vec<_>>();
|
|
||||||
let entries = entries_k.iter().map(|k| self.buf.get(k).unwrap());
|
|
||||||
if let Err(e) = self.index_counter.table.insert_many(entries).await {
|
|
||||||
self.errors += 1;
|
|
||||||
if self.errors >= 2 && *must_exit.borrow() {
|
|
||||||
error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, self.buf.len(), e);
|
|
||||||
return Ok(WorkerState::Done);
|
|
||||||
}
|
|
||||||
// Propagate error up to worker manager, it will log it, increment a counter,
|
|
||||||
// and sleep for a certain delay (with exponential backoff), waiting for
|
|
||||||
// things to go back to normal
|
|
||||||
return Err(e);
|
|
||||||
} else {
|
|
||||||
for k in entries_k {
|
|
||||||
self.buf.remove(&k);
|
|
||||||
}
|
|
||||||
self.errors = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(WorkerState::Busy);
|
|
||||||
} else if closed {
|
|
||||||
return Ok(WorkerState::Done);
|
|
||||||
} else {
|
|
||||||
return Ok(WorkerState::Idle);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
|
|
||||||
match self.propagate_rx.recv().await {
|
|
||||||
Some((pk, sk, counters)) => {
|
|
||||||
self.add_ent(pk, sk, counters);
|
|
||||||
WorkerState::Busy
|
|
||||||
}
|
|
||||||
None => match self.buf.is_empty() {
|
|
||||||
false => WorkerState::Busy,
|
|
||||||
true => WorkerState::Done,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
|
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
|
||||||
struct LocalCounterEntry<T: CountedItem> {
|
struct LocalCounterEntry<T: CountedItem> {
|
||||||
|
|
|
@ -273,14 +273,9 @@ impl K2VRpcHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> {
|
fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> {
|
||||||
let tree_key = self
|
|
||||||
.item_table
|
|
||||||
.data
|
|
||||||
.tree_key(&item.partition, &item.sort_key);
|
|
||||||
|
|
||||||
self.item_table
|
self.item_table
|
||||||
.data
|
.data
|
||||||
.update_entry_with(&tree_key[..], |ent| {
|
.update_entry_with(&item.partition, &item.sort_key, |ent| {
|
||||||
let mut ent = ent.unwrap_or_else(|| {
|
let mut ent = ent.unwrap_or_else(|| {
|
||||||
K2VItem::new(
|
K2VItem::new(
|
||||||
item.partition.bucket_id,
|
item.partition.bucket_id,
|
||||||
|
|
|
@ -4,7 +4,6 @@ use std::sync::Arc;
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use garage_table::crdt::*;
|
use garage_table::crdt::*;
|
||||||
|
@ -221,7 +220,6 @@ impl Crdt for Object {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ObjectTable {
|
pub struct ObjectTable {
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
|
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
|
||||||
pub object_counter_table: Arc<IndexCounter<Object>>,
|
pub object_counter_table: Arc<IndexCounter<Object>>,
|
||||||
}
|
}
|
||||||
|
@ -255,34 +253,34 @@ impl TableSchema for ObjectTable {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. Spawn threads that propagates deletions to version table
|
// 2. Enqueue propagation deletions to version table
|
||||||
let version_table = self.version_table.clone();
|
if let (Some(old_v), Some(new_v)) = (old, new) {
|
||||||
let old = old.cloned();
|
// Propagate deletion of old versions
|
||||||
let new = new.cloned();
|
for v in old_v.versions.iter() {
|
||||||
|
let newly_deleted = match new_v
|
||||||
self.background.spawn(async move {
|
.versions
|
||||||
if let (Some(old_v), Some(new_v)) = (old, new) {
|
.binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
|
||||||
// Propagate deletion of old versions
|
{
|
||||||
for v in old_v.versions.iter() {
|
Err(_) => true,
|
||||||
let newly_deleted = match new_v
|
Ok(i) => {
|
||||||
.versions
|
new_v.versions[i].state == ObjectVersionState::Aborted
|
||||||
.binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
|
&& v.state != ObjectVersionState::Aborted
|
||||||
{
|
}
|
||||||
Err(_) => true,
|
};
|
||||||
Ok(i) => {
|
if newly_deleted {
|
||||||
new_v.versions[i].state == ObjectVersionState::Aborted
|
let deleted_version =
|
||||||
&& v.state != ObjectVersionState::Aborted
|
Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
|
||||||
}
|
let res = self.version_table.queue_insert(tx, &deleted_version);
|
||||||
};
|
if let Err(e) = db::unabort(res)? {
|
||||||
if newly_deleted {
|
error!(
|
||||||
let deleted_version =
|
"Unable to enqueue version deletion propagation: {}. A repair will be needed.",
|
||||||
Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
|
e
|
||||||
version_table.insert(&deleted_version).await?;
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
}
|
||||||
});
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ use std::sync::Arc;
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use garage_table::crdt::*;
|
use garage_table::crdt::*;
|
||||||
|
@ -127,7 +126,6 @@ impl Crdt for Version {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct VersionTable {
|
pub struct VersionTable {
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
|
pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -141,33 +139,26 @@ impl TableSchema for VersionTable {
|
||||||
|
|
||||||
fn updated(
|
fn updated(
|
||||||
&self,
|
&self,
|
||||||
_tx: &mut db::Transaction,
|
tx: &mut db::Transaction,
|
||||||
old: Option<&Self::E>,
|
old: Option<&Self::E>,
|
||||||
new: Option<&Self::E>,
|
new: Option<&Self::E>,
|
||||||
) -> db::TxOpResult<()> {
|
) -> db::TxOpResult<()> {
|
||||||
let block_ref_table = self.block_ref_table.clone();
|
if let (Some(old_v), Some(new_v)) = (old, new) {
|
||||||
let old = old.cloned();
|
// Propagate deletion of version blocks
|
||||||
let new = new.cloned();
|
if new_v.deleted.get() && !old_v.deleted.get() {
|
||||||
|
let deleted_block_refs = old_v.blocks.items().iter().map(|(_k, vb)| BlockRef {
|
||||||
self.background.spawn(async move {
|
block: vb.hash,
|
||||||
if let (Some(old_v), Some(new_v)) = (old, new) {
|
version: old_v.uuid,
|
||||||
// Propagate deletion of version blocks
|
deleted: true.into(),
|
||||||
if new_v.deleted.get() && !old_v.deleted.get() {
|
});
|
||||||
let deleted_block_refs = old_v
|
for block_ref in deleted_block_refs {
|
||||||
.blocks
|
let res = self.block_ref_table.queue_insert(tx, &block_ref);
|
||||||
.items()
|
if let Err(e) = db::unabort(res)? {
|
||||||
.iter()
|
error!("Unable to enqueue block ref deletion propagation: {}. A repair will be needed.", e);
|
||||||
.map(|(_k, vb)| BlockRef {
|
}
|
||||||
block: vb.hash,
|
|
||||||
version: old_v.uuid,
|
|
||||||
deleted: true.into(),
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
block_ref_table.insert_many(&deleted_block_refs[..]).await?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
}
|
||||||
});
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,6 @@ use std::time::Duration;
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
use futures::stream::futures_unordered::FuturesUnordered;
|
use futures::stream::futures_unordered::FuturesUnordered;
|
||||||
use futures::stream::StreamExt;
|
use futures::stream::StreamExt;
|
||||||
use futures_util::future::FutureExt;
|
|
||||||
use tokio::select;
|
use tokio::select;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
|
|
||||||
|
@ -24,7 +23,6 @@ pub use netapp::message::{
|
||||||
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
|
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
|
||||||
pub use netapp::{self, NetApp, NodeID};
|
pub use netapp::{self, NetApp, NodeID};
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error;
|
use garage_util::error::Error;
|
||||||
use garage_util::metrics::RecordDuration;
|
use garage_util::metrics::RecordDuration;
|
||||||
|
@ -94,7 +92,6 @@ pub struct RpcHelper(Arc<RpcHelperInner>);
|
||||||
struct RpcHelperInner {
|
struct RpcHelperInner {
|
||||||
our_node_id: Uuid,
|
our_node_id: Uuid,
|
||||||
fullmesh: Arc<FullMeshPeeringStrategy>,
|
fullmesh: Arc<FullMeshPeeringStrategy>,
|
||||||
background: Arc<BackgroundRunner>,
|
|
||||||
ring: watch::Receiver<Arc<Ring>>,
|
ring: watch::Receiver<Arc<Ring>>,
|
||||||
metrics: RpcMetrics,
|
metrics: RpcMetrics,
|
||||||
rpc_timeout: Duration,
|
rpc_timeout: Duration,
|
||||||
|
@ -104,7 +101,6 @@ impl RpcHelper {
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
our_node_id: Uuid,
|
our_node_id: Uuid,
|
||||||
fullmesh: Arc<FullMeshPeeringStrategy>,
|
fullmesh: Arc<FullMeshPeeringStrategy>,
|
||||||
background: Arc<BackgroundRunner>,
|
|
||||||
ring: watch::Receiver<Arc<Ring>>,
|
ring: watch::Receiver<Arc<Ring>>,
|
||||||
rpc_timeout: Option<Duration>,
|
rpc_timeout: Option<Duration>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
|
@ -113,7 +109,6 @@ impl RpcHelper {
|
||||||
Self(Arc::new(RpcHelperInner {
|
Self(Arc::new(RpcHelperInner {
|
||||||
our_node_id,
|
our_node_id,
|
||||||
fullmesh,
|
fullmesh,
|
||||||
background,
|
|
||||||
ring,
|
ring,
|
||||||
metrics,
|
metrics,
|
||||||
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
|
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
|
||||||
|
@ -377,16 +372,13 @@ impl RpcHelper {
|
||||||
|
|
||||||
if !resp_stream.is_empty() {
|
if !resp_stream.is_empty() {
|
||||||
// Continue remaining requests in background.
|
// Continue remaining requests in background.
|
||||||
// Continue the remaining requests immediately using tokio::spawn
|
// Note: these requests can get interrupted on process shutdown,
|
||||||
// but enqueue a task in the background runner
|
// we must not count on them being executed for certain.
|
||||||
// to ensure that the process won't exit until the requests are done
|
// For all background things that have to happen with certainty,
|
||||||
// (if we had just enqueued the resp_stream.collect directly in the background runner,
|
// they have to be put in a proper queue that is persisted to disk.
|
||||||
// the requests might have been put on hold in the background runner's queue,
|
tokio::spawn(async move {
|
||||||
// in which case they might timeout or otherwise fail)
|
|
||||||
let wait_finished_fut = tokio::spawn(async move {
|
|
||||||
resp_stream.collect::<Vec<Result<_, _>>>().await;
|
resp_stream.collect::<Vec<Result<_, _>>>().await;
|
||||||
});
|
});
|
||||||
self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,6 @@ use netapp::peering::fullmesh::FullMeshPeeringStrategy;
|
||||||
use netapp::util::parse_and_resolve_peer_addr_async;
|
use netapp::util::parse_and_resolve_peer_addr_async;
|
||||||
use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
|
use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
|
||||||
|
|
||||||
use garage_util::background::BackgroundRunner;
|
|
||||||
use garage_util::config::Config;
|
use garage_util::config::Config;
|
||||||
#[cfg(feature = "kubernetes-discovery")]
|
#[cfg(feature = "kubernetes-discovery")]
|
||||||
use garage_util::config::KubernetesDiscoveryConfig;
|
use garage_util::config::KubernetesDiscoveryConfig;
|
||||||
|
@ -50,8 +49,6 @@ pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008
|
||||||
/// RPC endpoint used for calls related to membership
|
/// RPC endpoint used for calls related to membership
|
||||||
pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc";
|
pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc";
|
||||||
|
|
||||||
pub const CONNECT_ERROR_MESSAGE: &str = "Error establishing RPC connection to remote node. This can happen if the remote node is not reachable on the network, but also if the two nodes are not configured with the same rpc_secret";
|
|
||||||
|
|
||||||
/// RPC messages related to membership
|
/// RPC messages related to membership
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub enum SystemRpc {
|
pub enum SystemRpc {
|
||||||
|
@ -110,9 +107,6 @@ pub struct System {
|
||||||
pub ring: watch::Receiver<Arc<Ring>>,
|
pub ring: watch::Receiver<Arc<Ring>>,
|
||||||
update_ring: Mutex<watch::Sender<Arc<Ring>>>,
|
update_ring: Mutex<watch::Sender<Arc<Ring>>>,
|
||||||
|
|
||||||
/// The job runner of this node
|
|
||||||
pub background: Arc<BackgroundRunner>,
|
|
||||||
|
|
||||||
/// Path to metadata directory
|
/// Path to metadata directory
|
||||||
pub metadata_dir: PathBuf,
|
pub metadata_dir: PathBuf,
|
||||||
}
|
}
|
||||||
|
@ -232,7 +226,6 @@ impl System {
|
||||||
/// Create this node's membership manager
|
/// Create this node's membership manager
|
||||||
pub fn new(
|
pub fn new(
|
||||||
network_key: NetworkKey,
|
network_key: NetworkKey,
|
||||||
background: Arc<BackgroundRunner>,
|
|
||||||
replication_mode: ReplicationMode,
|
replication_mode: ReplicationMode,
|
||||||
config: &Config,
|
config: &Config,
|
||||||
) -> Result<Arc<Self>, Error> {
|
) -> Result<Arc<Self>, Error> {
|
||||||
|
@ -354,7 +347,6 @@ impl System {
|
||||||
rpc: RpcHelper::new(
|
rpc: RpcHelper::new(
|
||||||
netapp.id.into(),
|
netapp.id.into(),
|
||||||
fullmesh,
|
fullmesh,
|
||||||
background.clone(),
|
|
||||||
ring.clone(),
|
ring.clone(),
|
||||||
config.rpc_timeout_msec.map(Duration::from_millis),
|
config.rpc_timeout_msec.map(Duration::from_millis),
|
||||||
),
|
),
|
||||||
|
@ -372,7 +364,6 @@ impl System {
|
||||||
|
|
||||||
ring,
|
ring,
|
||||||
update_ring: Mutex::new(update_ring),
|
update_ring: Mutex::new(update_ring),
|
||||||
background,
|
|
||||||
metadata_dir: config.metadata_dir.clone(),
|
metadata_dir: config.metadata_dir.clone(),
|
||||||
});
|
});
|
||||||
sys.system_endpoint.set_handler(sys.clone());
|
sys.system_endpoint.set_handler(sys.clone());
|
||||||
|
@ -444,17 +435,14 @@ impl System {
|
||||||
))
|
))
|
||||||
})?;
|
})?;
|
||||||
let mut errors = vec![];
|
let mut errors = vec![];
|
||||||
for ip in addrs.iter() {
|
for addr in addrs.iter() {
|
||||||
match self
|
match self.netapp.clone().try_connect(*addr, pubkey).await {
|
||||||
.netapp
|
|
||||||
.clone()
|
|
||||||
.try_connect(*ip, pubkey)
|
|
||||||
.await
|
|
||||||
.err_context(CONNECT_ERROR_MESSAGE)
|
|
||||||
{
|
|
||||||
Ok(()) => return Ok(()),
|
Ok(()) => return Ok(()),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
errors.push((*ip, e));
|
errors.push((
|
||||||
|
*addr,
|
||||||
|
Error::Message(connect_error_message(*addr, pubkey, e)),
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -578,7 +566,7 @@ impl System {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Save network configuration to disc
|
/// Save network configuration to disc
|
||||||
async fn save_cluster_layout(self: Arc<Self>) -> Result<(), Error> {
|
async fn save_cluster_layout(&self) -> Result<(), Error> {
|
||||||
let ring: Arc<Ring> = self.ring.borrow().clone();
|
let ring: Arc<Ring> = self.ring.borrow().clone();
|
||||||
self.persist_cluster_layout
|
self.persist_cluster_layout
|
||||||
.save_async(&ring.layout)
|
.save_async(&ring.layout)
|
||||||
|
@ -630,11 +618,7 @@ impl System {
|
||||||
if info.cluster_layout_version > local_info.cluster_layout_version
|
if info.cluster_layout_version > local_info.cluster_layout_version
|
||||||
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
|
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
|
||||||
{
|
{
|
||||||
let self2 = self.clone();
|
tokio::spawn(self.clone().pull_cluster_layout(from));
|
||||||
self.background.spawn_cancellable(async move {
|
|
||||||
self2.pull_cluster_layout(from).await;
|
|
||||||
Ok(())
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.node_status
|
self.node_status
|
||||||
|
@ -676,18 +660,21 @@ impl System {
|
||||||
drop(update_ring);
|
drop(update_ring);
|
||||||
|
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
self.background.spawn_cancellable(async move {
|
tokio::spawn(async move {
|
||||||
self2
|
if let Err(e) = self2
|
||||||
.rpc
|
.rpc
|
||||||
.broadcast(
|
.broadcast(
|
||||||
&self2.system_endpoint,
|
&self2.system_endpoint,
|
||||||
SystemRpc::AdvertiseClusterLayout(layout),
|
SystemRpc::AdvertiseClusterLayout(layout),
|
||||||
RequestStrategy::with_priority(PRIO_HIGH),
|
RequestStrategy::with_priority(PRIO_HIGH),
|
||||||
)
|
)
|
||||||
.await?;
|
.await
|
||||||
Ok(())
|
{
|
||||||
|
warn!("Error while broadcasting new cluster layout: {}", e);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
self.background.spawn(self.clone().save_cluster_layout());
|
|
||||||
|
self.save_cluster_layout().await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(SystemRpc::Ok)
|
Ok(SystemRpc::Ok)
|
||||||
|
@ -773,12 +760,12 @@ impl System {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (node_id, node_addr) in ping_list {
|
for (node_id, node_addr) in ping_list {
|
||||||
tokio::spawn(
|
let self2 = self.clone();
|
||||||
self.netapp
|
tokio::spawn(async move {
|
||||||
.clone()
|
if let Err(e) = self2.netapp.clone().try_connect(node_addr, node_id).await {
|
||||||
.try_connect(node_addr, node_id)
|
error!("{}", connect_error_message(node_addr, node_id, e));
|
||||||
.map(|r| r.err_context(CONNECT_ERROR_MESSAGE)),
|
}
|
||||||
);
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -787,11 +774,10 @@ impl System {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "consul-discovery")]
|
#[cfg(feature = "consul-discovery")]
|
||||||
self.background.spawn(self.clone().advertise_to_consul());
|
background::spawn(self.clone().advertise_to_consul());
|
||||||
|
|
||||||
#[cfg(feature = "kubernetes-discovery")]
|
#[cfg(feature = "kubernetes-discovery")]
|
||||||
self.background
|
background::spawn(self.clone().advertise_to_kubernetes());
|
||||||
.spawn(self.clone().advertise_to_kubernetes());
|
|
||||||
|
|
||||||
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
|
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
|
||||||
select! {
|
select! {
|
||||||
|
@ -881,3 +867,11 @@ async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> {
|
||||||
|
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn connect_error_message(
|
||||||
|
addr: SocketAddr,
|
||||||
|
pubkey: ed25519::PublicKey,
|
||||||
|
e: netapp::error::Error,
|
||||||
|
) -> String {
|
||||||
|
format!("Error establishing RPC connection to remote node: {}@{}.\nThis can happen if the remote node is not reachable on the network, but also if the two nodes are not configured with the same rpc_secret.\n{}", hex::encode(pubkey), addr, e)
|
||||||
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ garage_util = { version = "0.8.1", path = "../util" }
|
||||||
opentelemetry = "0.17"
|
opentelemetry = "0.17"
|
||||||
|
|
||||||
async-trait = "0.1.7"
|
async-trait = "0.1.7"
|
||||||
|
arc-swap = "1.0"
|
||||||
bytes = "1.0"
|
bytes = "1.0"
|
||||||
hex = "0.4"
|
hex = "0.4"
|
||||||
hexdump = "0.1"
|
hexdump = "0.1"
|
||||||
|
|
|
@ -31,6 +31,10 @@ pub struct TableData<F: TableSchema, R: TableReplication> {
|
||||||
pub(crate) merkle_tree: db::Tree,
|
pub(crate) merkle_tree: db::Tree,
|
||||||
pub(crate) merkle_todo: db::Tree,
|
pub(crate) merkle_todo: db::Tree,
|
||||||
pub(crate) merkle_todo_notify: Notify,
|
pub(crate) merkle_todo_notify: Notify,
|
||||||
|
|
||||||
|
pub(crate) insert_queue: db::Tree,
|
||||||
|
pub(crate) insert_queue_notify: Notify,
|
||||||
|
|
||||||
pub(crate) gc_todo: CountedTree,
|
pub(crate) gc_todo: CountedTree,
|
||||||
|
|
||||||
pub(crate) metrics: TableMetrics,
|
pub(crate) metrics: TableMetrics,
|
||||||
|
@ -53,9 +57,13 @@ where
|
||||||
.open_tree(&format!("{}:merkle_todo", F::TABLE_NAME))
|
.open_tree(&format!("{}:merkle_todo", F::TABLE_NAME))
|
||||||
.expect("Unable to open DB Merkle TODO tree");
|
.expect("Unable to open DB Merkle TODO tree");
|
||||||
|
|
||||||
|
let insert_queue = db
|
||||||
|
.open_tree(&format!("{}:insert_queue", F::TABLE_NAME))
|
||||||
|
.expect("Unable to open insert queue DB tree");
|
||||||
|
|
||||||
let gc_todo = db
|
let gc_todo = db
|
||||||
.open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME))
|
.open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME))
|
||||||
.expect("Unable to open DB tree");
|
.expect("Unable to open GC DB tree");
|
||||||
let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2");
|
let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2");
|
||||||
|
|
||||||
let metrics = TableMetrics::new(
|
let metrics = TableMetrics::new(
|
||||||
|
@ -74,6 +82,8 @@ where
|
||||||
merkle_tree,
|
merkle_tree,
|
||||||
merkle_todo,
|
merkle_todo,
|
||||||
merkle_todo_notify: Notify::new(),
|
merkle_todo_notify: Notify::new(),
|
||||||
|
insert_queue,
|
||||||
|
insert_queue_notify: Notify::new(),
|
||||||
gc_todo,
|
gc_todo,
|
||||||
metrics,
|
metrics,
|
||||||
})
|
})
|
||||||
|
@ -173,9 +183,8 @@ where
|
||||||
|
|
||||||
pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
|
pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
|
||||||
let update = self.decode_entry(update_bytes)?;
|
let update = self.decode_entry(update_bytes)?;
|
||||||
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
|
|
||||||
|
|
||||||
self.update_entry_with(&tree_key[..], |ent| match ent {
|
self.update_entry_with(update.partition_key(), update.sort_key(), |ent| match ent {
|
||||||
Some(mut ent) => {
|
Some(mut ent) => {
|
||||||
ent.merge(&update);
|
ent.merge(&update);
|
||||||
ent
|
ent
|
||||||
|
@ -187,11 +196,14 @@ where
|
||||||
|
|
||||||
pub fn update_entry_with(
|
pub fn update_entry_with(
|
||||||
&self,
|
&self,
|
||||||
tree_key: &[u8],
|
partition_key: &F::P,
|
||||||
|
sort_key: &F::S,
|
||||||
f: impl Fn(Option<F::E>) -> F::E,
|
f: impl Fn(Option<F::E>) -> F::E,
|
||||||
) -> Result<Option<F::E>, Error> {
|
) -> Result<Option<F::E>, Error> {
|
||||||
|
let tree_key = self.tree_key(partition_key, sort_key);
|
||||||
|
|
||||||
let changed = self.store.db().transaction(|mut tx| {
|
let changed = self.store.db().transaction(|mut tx| {
|
||||||
let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, tree_key)? {
|
let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, &tree_key)? {
|
||||||
Some(old_bytes) => {
|
Some(old_bytes) => {
|
||||||
let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?;
|
let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?;
|
||||||
let new_entry = f(Some(old_entry.clone()));
|
let new_entry = f(Some(old_entry.clone()));
|
||||||
|
@ -200,23 +212,23 @@ where
|
||||||
None => (None, None, f(None)),
|
None => (None, None, f(None)),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Scenario 1: the value changed, so of course there is a change
|
// Changed can be true in two scenarios
|
||||||
let value_changed = Some(&new_entry) != old_entry.as_ref();
|
// Scenario 1: the actual represented value changed,
|
||||||
|
// so of course the messagepack encoding changed as well
|
||||||
// Scenario 2: the value didn't change but due to a migration in the
|
// Scenario 2: the value didn't change but due to a migration in the
|
||||||
// data format, the messagepack encoding changed. In this case
|
// data format, the messagepack encoding changed. In this case,
|
||||||
// we have to write the migrated value in the table and update
|
// we also have to write the migrated value in the table and update
|
||||||
// the associated Merkle tree entry.
|
// the associated Merkle tree entry.
|
||||||
let new_bytes = rmp_to_vec_all_named(&new_entry)
|
let new_bytes = rmp_to_vec_all_named(&new_entry)
|
||||||
.map_err(Error::RmpEncode)
|
.map_err(Error::RmpEncode)
|
||||||
.map_err(db::TxError::Abort)?;
|
.map_err(db::TxError::Abort)?;
|
||||||
let encoding_changed = Some(&new_bytes[..]) != old_bytes.as_ref().map(|x| &x[..]);
|
let changed = Some(&new_bytes[..]) != old_bytes.as_deref();
|
||||||
drop(old_bytes);
|
drop(old_bytes);
|
||||||
|
|
||||||
if value_changed || encoding_changed {
|
if changed {
|
||||||
let new_bytes_hash = blake2sum(&new_bytes[..]);
|
let new_bytes_hash = blake2sum(&new_bytes);
|
||||||
tx.insert(&self.merkle_todo, tree_key, new_bytes_hash.as_slice())?;
|
tx.insert(&self.merkle_todo, &tree_key, new_bytes_hash.as_slice())?;
|
||||||
tx.insert(&self.store, tree_key, new_bytes)?;
|
tx.insert(&self.store, &tree_key, new_bytes)?;
|
||||||
|
|
||||||
self.instance
|
self.instance
|
||||||
.updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?;
|
.updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?;
|
||||||
|
@ -242,7 +254,7 @@ where
|
||||||
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
|
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
|
||||||
let nodes = self.replication.write_nodes(&pk_hash);
|
let nodes = self.replication.write_nodes(&pk_hash);
|
||||||
if nodes.first() == Some(&self.system.id) {
|
if nodes.first() == Some(&self.system.id) {
|
||||||
GcTodoEntry::new(tree_key.to_vec(), new_bytes_hash).save(&self.gc_todo)?;
|
GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -258,10 +270,11 @@ where
|
||||||
.db()
|
.db()
|
||||||
.transaction(|mut tx| match tx.get(&self.store, k)? {
|
.transaction(|mut tx| match tx.get(&self.store, k)? {
|
||||||
Some(cur_v) if cur_v == v => {
|
Some(cur_v) if cur_v == v => {
|
||||||
|
let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?;
|
||||||
|
|
||||||
tx.remove(&self.store, k)?;
|
tx.remove(&self.store, k)?;
|
||||||
tx.insert(&self.merkle_todo, k, vec![])?;
|
tx.insert(&self.merkle_todo, k, vec![])?;
|
||||||
|
|
||||||
let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?;
|
|
||||||
self.instance.updated(&mut tx, Some(&old_entry), None)?;
|
self.instance.updated(&mut tx, Some(&old_entry), None)?;
|
||||||
Ok(true)
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
@ -285,10 +298,11 @@ where
|
||||||
.db()
|
.db()
|
||||||
.transaction(|mut tx| match tx.get(&self.store, k)? {
|
.transaction(|mut tx| match tx.get(&self.store, k)? {
|
||||||
Some(cur_v) if blake2sum(&cur_v[..]) == vhash => {
|
Some(cur_v) if blake2sum(&cur_v[..]) == vhash => {
|
||||||
|
let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?;
|
||||||
|
|
||||||
tx.remove(&self.store, k)?;
|
tx.remove(&self.store, k)?;
|
||||||
tx.insert(&self.merkle_todo, k, vec![])?;
|
tx.insert(&self.merkle_todo, k, vec![])?;
|
||||||
|
|
||||||
let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?;
|
|
||||||
self.instance.updated(&mut tx, Some(&old_entry), None)?;
|
self.instance.updated(&mut tx, Some(&old_entry), None)?;
|
||||||
Ok(true)
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
@ -302,6 +316,32 @@ where
|
||||||
Ok(removed)
|
Ok(removed)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---- Insert queue functions ----
|
||||||
|
|
||||||
|
pub(crate) fn queue_insert(
|
||||||
|
&self,
|
||||||
|
tx: &mut db::Transaction,
|
||||||
|
ins: &F::E,
|
||||||
|
) -> db::TxResult<(), Error> {
|
||||||
|
let tree_key = self.tree_key(ins.partition_key(), ins.sort_key());
|
||||||
|
|
||||||
|
let new_entry = match tx.get(&self.insert_queue, &tree_key)? {
|
||||||
|
Some(old_v) => {
|
||||||
|
let mut entry = self.decode_entry(&old_v).map_err(db::TxError::Abort)?;
|
||||||
|
entry.merge(ins);
|
||||||
|
rmp_to_vec_all_named(&entry)
|
||||||
|
}
|
||||||
|
None => rmp_to_vec_all_named(ins),
|
||||||
|
};
|
||||||
|
let new_entry = new_entry
|
||||||
|
.map_err(Error::RmpEncode)
|
||||||
|
.map_err(db::TxError::Abort)?;
|
||||||
|
tx.insert(&self.insert_queue, &tree_key, new_entry)?;
|
||||||
|
self.insert_queue_notify.notify_one();
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
// ---- Utility functions ----
|
// ---- Utility functions ----
|
||||||
|
|
||||||
pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
|
pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
|
||||||
|
|
|
@ -54,24 +54,25 @@ where
|
||||||
F: TableSchema + 'static,
|
F: TableSchema + 'static,
|
||||||
R: TableReplication + 'static,
|
R: TableReplication + 'static,
|
||||||
{
|
{
|
||||||
pub(crate) fn launch(system: Arc<System>, data: Arc<TableData<F, R>>) -> Arc<Self> {
|
pub(crate) fn new(system: Arc<System>, data: Arc<TableData<F, R>>) -> Arc<Self> {
|
||||||
let endpoint = system
|
let endpoint = system
|
||||||
.netapp
|
.netapp
|
||||||
.endpoint(format!("garage_table/gc.rs/Rpc:{}", F::TABLE_NAME));
|
.endpoint(format!("garage_table/gc.rs/Rpc:{}", F::TABLE_NAME));
|
||||||
|
|
||||||
let gc = Arc::new(Self {
|
let gc = Arc::new(Self {
|
||||||
system: system.clone(),
|
system,
|
||||||
data,
|
data,
|
||||||
endpoint,
|
endpoint,
|
||||||
});
|
});
|
||||||
|
|
||||||
gc.endpoint.set_handler(gc.clone());
|
gc.endpoint.set_handler(gc.clone());
|
||||||
|
|
||||||
system.background.spawn_worker(GcWorker::new(gc.clone()));
|
|
||||||
|
|
||||||
gc
|
gc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
|
bg.spawn_worker(GcWorker::new(self.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
|
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
|
||||||
let now = now_msec();
|
let now = now_msec();
|
||||||
|
|
||||||
|
@ -347,10 +348,7 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
if *must_exit.borrow() {
|
|
||||||
return WorkerState::Done;
|
|
||||||
}
|
|
||||||
tokio::time::sleep(self.wait_delay).await;
|
tokio::time::sleep(self.wait_delay).await;
|
||||||
WorkerState::Busy
|
WorkerState::Busy
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,16 +4,18 @@
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate tracing;
|
extern crate tracing;
|
||||||
|
|
||||||
mod metrics;
|
|
||||||
pub mod schema;
|
pub mod schema;
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
|
||||||
pub mod data;
|
pub mod data;
|
||||||
|
pub mod replication;
|
||||||
|
pub mod table;
|
||||||
|
|
||||||
mod gc;
|
mod gc;
|
||||||
mod merkle;
|
mod merkle;
|
||||||
pub mod replication;
|
mod metrics;
|
||||||
|
mod queue;
|
||||||
mod sync;
|
mod sync;
|
||||||
pub mod table;
|
|
||||||
|
|
||||||
pub use schema::*;
|
pub use schema::*;
|
||||||
pub use table::*;
|
pub use table::*;
|
||||||
|
|
|
@ -3,6 +3,7 @@ use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use tokio::select;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
@ -69,17 +70,17 @@ where
|
||||||
F: TableSchema + 'static,
|
F: TableSchema + 'static,
|
||||||
R: TableReplication + 'static,
|
R: TableReplication + 'static,
|
||||||
{
|
{
|
||||||
pub(crate) fn launch(background: &BackgroundRunner, data: Arc<TableData<F, R>>) -> Arc<Self> {
|
pub(crate) fn new(data: Arc<TableData<F, R>>) -> Arc<Self> {
|
||||||
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
|
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
|
||||||
|
|
||||||
let ret = Arc::new(Self {
|
Arc::new(Self {
|
||||||
data,
|
data,
|
||||||
empty_node_hash,
|
empty_node_hash,
|
||||||
});
|
})
|
||||||
|
}
|
||||||
|
|
||||||
background.spawn_worker(MerkleWorker(ret.clone()));
|
pub(crate) fn spawn_workers(self: &Arc<Self>, background: &BackgroundRunner) {
|
||||||
|
background.spawn_worker(MerkleWorker(self.clone()));
|
||||||
ret
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn updater_loop_iter(&self) -> Result<WorkerState, Error> {
|
fn updater_loop_iter(&self) -> Result<WorkerState, Error> {
|
||||||
|
@ -339,11 +340,11 @@ where
|
||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
if *must_exit.borrow() {
|
select! {
|
||||||
return WorkerState::Done;
|
_ = tokio::time::sleep(Duration::from_secs(60)) => (),
|
||||||
|
_ = self.0.data.merkle_todo_notify.notified() => (),
|
||||||
}
|
}
|
||||||
tokio::time::sleep(Duration::from_secs(10)).await;
|
|
||||||
WorkerState::Busy
|
WorkerState::Busy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
81
src/table/queue.rs
Normal file
81
src/table/queue.rs
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use tokio::select;
|
||||||
|
use tokio::sync::watch;
|
||||||
|
|
||||||
|
use garage_util::background::*;
|
||||||
|
use garage_util::error::Error;
|
||||||
|
|
||||||
|
use crate::replication::*;
|
||||||
|
use crate::schema::*;
|
||||||
|
use crate::table::*;
|
||||||
|
|
||||||
|
const BATCH_SIZE: usize = 100;
|
||||||
|
|
||||||
|
pub(crate) struct InsertQueueWorker<F, R>(pub(crate) Arc<Table<F, R>>)
|
||||||
|
where
|
||||||
|
F: TableSchema + 'static,
|
||||||
|
R: TableReplication + 'static;
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl<F, R> Worker for InsertQueueWorker<F, R>
|
||||||
|
where
|
||||||
|
F: TableSchema + 'static,
|
||||||
|
R: TableReplication + 'static,
|
||||||
|
{
|
||||||
|
fn name(&self) -> String {
|
||||||
|
format!("{} queue", F::TABLE_NAME)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn status(&self) -> WorkerStatus {
|
||||||
|
WorkerStatus {
|
||||||
|
queue_length: Some(self.0.data.insert_queue.len().unwrap_or(0) as u64),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
||||||
|
let mut kv_pairs = vec![];
|
||||||
|
let mut values = vec![];
|
||||||
|
|
||||||
|
for entry_kv in self.0.data.insert_queue.iter()? {
|
||||||
|
let (k, v) = entry_kv?;
|
||||||
|
|
||||||
|
values.push(self.0.data.decode_entry(&v)?);
|
||||||
|
kv_pairs.push((k, v));
|
||||||
|
|
||||||
|
if kv_pairs.len() > BATCH_SIZE {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if kv_pairs.is_empty() {
|
||||||
|
return Ok(WorkerState::Idle);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.0.insert_many(values).await?;
|
||||||
|
|
||||||
|
self.0.data.insert_queue.db().transaction(|mut tx| {
|
||||||
|
for (k, v) in kv_pairs.iter() {
|
||||||
|
if let Some(v2) = tx.get(&self.0.data.insert_queue, k)? {
|
||||||
|
if &v2 == v {
|
||||||
|
tx.remove(&self.0.data.insert_queue, k)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(WorkerState::Busy)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
|
select! {
|
||||||
|
_ = tokio::time::sleep(Duration::from_secs(600)) => (),
|
||||||
|
_ = self.0.data.insert_queue_notify.notified() => (),
|
||||||
|
}
|
||||||
|
WorkerState::Busy
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,6 +2,7 @@ use std::collections::VecDeque;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
use arc_swap::ArcSwapOption;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use futures_util::stream::*;
|
use futures_util::stream::*;
|
||||||
use opentelemetry::KeyValue;
|
use opentelemetry::KeyValue;
|
||||||
|
@ -13,7 +14,7 @@ use tokio::sync::{mpsc, watch};
|
||||||
|
|
||||||
use garage_util::background::*;
|
use garage_util::background::*;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error;
|
use garage_util::error::{Error, OkOrMessage};
|
||||||
|
|
||||||
use garage_rpc::ring::*;
|
use garage_rpc::ring::*;
|
||||||
use garage_rpc::system::System;
|
use garage_rpc::system::System;
|
||||||
|
@ -32,7 +33,7 @@ pub struct TableSyncer<F: TableSchema + 'static, R: TableReplication + 'static>
|
||||||
data: Arc<TableData<F, R>>,
|
data: Arc<TableData<F, R>>,
|
||||||
merkle: Arc<MerkleUpdater<F, R>>,
|
merkle: Arc<MerkleUpdater<F, R>>,
|
||||||
|
|
||||||
add_full_sync_tx: mpsc::UnboundedSender<()>,
|
add_full_sync_tx: ArcSwapOption<mpsc::UnboundedSender<()>>,
|
||||||
endpoint: Arc<Endpoint<SyncRpc, Self>>,
|
endpoint: Arc<Endpoint<SyncRpc, Self>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -65,7 +66,7 @@ where
|
||||||
F: TableSchema + 'static,
|
F: TableSchema + 'static,
|
||||||
R: TableReplication + 'static,
|
R: TableReplication + 'static,
|
||||||
{
|
{
|
||||||
pub(crate) fn launch(
|
pub(crate) fn new(
|
||||||
system: Arc<System>,
|
system: Arc<System>,
|
||||||
data: Arc<TableData<F, R>>,
|
data: Arc<TableData<F, R>>,
|
||||||
merkle: Arc<MerkleUpdater<F, R>>,
|
merkle: Arc<MerkleUpdater<F, R>>,
|
||||||
|
@ -74,34 +75,40 @@ where
|
||||||
.netapp
|
.netapp
|
||||||
.endpoint(format!("garage_table/sync.rs/Rpc:{}", F::TABLE_NAME));
|
.endpoint(format!("garage_table/sync.rs/Rpc:{}", F::TABLE_NAME));
|
||||||
|
|
||||||
let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
|
|
||||||
|
|
||||||
let syncer = Arc::new(Self {
|
let syncer = Arc::new(Self {
|
||||||
system: system.clone(),
|
system,
|
||||||
data,
|
data,
|
||||||
merkle,
|
merkle,
|
||||||
add_full_sync_tx,
|
add_full_sync_tx: ArcSwapOption::new(None),
|
||||||
endpoint,
|
endpoint,
|
||||||
});
|
});
|
||||||
|
|
||||||
syncer.endpoint.set_handler(syncer.clone());
|
syncer.endpoint.set_handler(syncer.clone());
|
||||||
|
|
||||||
system.background.spawn_worker(SyncWorker {
|
|
||||||
syncer: syncer.clone(),
|
|
||||||
ring_recv: system.ring.clone(),
|
|
||||||
ring: system.ring.borrow().clone(),
|
|
||||||
add_full_sync_rx,
|
|
||||||
todo: vec![],
|
|
||||||
next_full_sync: Instant::now() + Duration::from_secs(20),
|
|
||||||
});
|
|
||||||
|
|
||||||
syncer
|
syncer
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_full_sync(&self) {
|
pub(crate) fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
if self.add_full_sync_tx.send(()).is_err() {
|
let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
|
||||||
error!("({}) Could not add full sync", F::TABLE_NAME);
|
self.add_full_sync_tx
|
||||||
}
|
.store(Some(Arc::new(add_full_sync_tx)));
|
||||||
|
|
||||||
|
bg.spawn_worker(SyncWorker {
|
||||||
|
syncer: self.clone(),
|
||||||
|
ring_recv: self.system.ring.clone(),
|
||||||
|
ring: self.system.ring.borrow().clone(),
|
||||||
|
add_full_sync_rx,
|
||||||
|
todo: vec![],
|
||||||
|
next_full_sync: Instant::now() + Duration::from_secs(20),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_full_sync(&self) -> Result<(), Error> {
|
||||||
|
let tx = self.add_full_sync_tx.load();
|
||||||
|
let tx = tx
|
||||||
|
.as_ref()
|
||||||
|
.ok_or_message("table sync worker is not running")?;
|
||||||
|
tx.send(()).ok_or_message("send error")?;
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// ----
|
// ----
|
||||||
|
@ -586,10 +593,7 @@ impl<F: TableSchema + 'static, R: TableReplication + 'static> Worker for SyncWor
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
async fn wait_for_work(&mut self) -> WorkerState {
|
||||||
if *must_exit.borrow() {
|
|
||||||
return WorkerState::Done;
|
|
||||||
}
|
|
||||||
select! {
|
select! {
|
||||||
s = self.add_full_sync_rx.recv() => {
|
s = self.add_full_sync_rx.recv() => {
|
||||||
if let Some(()) = s {
|
if let Some(()) = s {
|
||||||
|
|
|
@ -14,6 +14,7 @@ use opentelemetry::{
|
||||||
|
|
||||||
use garage_db as db;
|
use garage_db as db;
|
||||||
|
|
||||||
|
use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error;
|
use garage_util::error::Error;
|
||||||
use garage_util::metrics::RecordDuration;
|
use garage_util::metrics::RecordDuration;
|
||||||
|
@ -25,6 +26,7 @@ use crate::crdt::Crdt;
|
||||||
use crate::data::*;
|
use crate::data::*;
|
||||||
use crate::gc::*;
|
use crate::gc::*;
|
||||||
use crate::merkle::*;
|
use crate::merkle::*;
|
||||||
|
use crate::queue::InsertQueueWorker;
|
||||||
use crate::replication::*;
|
use crate::replication::*;
|
||||||
use crate::schema::*;
|
use crate::schema::*;
|
||||||
use crate::sync::*;
|
use crate::sync::*;
|
||||||
|
@ -35,6 +37,7 @@ pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
|
||||||
pub data: Arc<TableData<F, R>>,
|
pub data: Arc<TableData<F, R>>,
|
||||||
pub merkle_updater: Arc<MerkleUpdater<F, R>>,
|
pub merkle_updater: Arc<MerkleUpdater<F, R>>,
|
||||||
pub syncer: Arc<TableSyncer<F, R>>,
|
pub syncer: Arc<TableSyncer<F, R>>,
|
||||||
|
gc: Arc<TableGc<F, R>>,
|
||||||
endpoint: Arc<Endpoint<TableRpc<F>, Self>>,
|
endpoint: Arc<Endpoint<TableRpc<F>, Self>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,15 +78,16 @@ where
|
||||||
|
|
||||||
let data = TableData::new(system.clone(), instance, replication, db);
|
let data = TableData::new(system.clone(), instance, replication, db);
|
||||||
|
|
||||||
let merkle_updater = MerkleUpdater::launch(&system.background, data.clone());
|
let merkle_updater = MerkleUpdater::new(data.clone());
|
||||||
|
|
||||||
let syncer = TableSyncer::launch(system.clone(), data.clone(), merkle_updater.clone());
|
let syncer = TableSyncer::new(system.clone(), data.clone(), merkle_updater.clone());
|
||||||
TableGc::launch(system.clone(), data.clone());
|
let gc = TableGc::new(system.clone(), data.clone());
|
||||||
|
|
||||||
let table = Arc::new(Self {
|
let table = Arc::new(Self {
|
||||||
system,
|
system,
|
||||||
data,
|
data,
|
||||||
merkle_updater,
|
merkle_updater,
|
||||||
|
gc,
|
||||||
syncer,
|
syncer,
|
||||||
endpoint,
|
endpoint,
|
||||||
});
|
});
|
||||||
|
@ -93,6 +97,13 @@ where
|
||||||
table
|
table
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
|
||||||
|
self.merkle_updater.spawn_workers(bg);
|
||||||
|
self.syncer.spawn_workers(bg);
|
||||||
|
self.gc.spawn_workers(bg);
|
||||||
|
bg.spawn_worker(InsertQueueWorker(self.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
|
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
|
||||||
let tracer = opentelemetry::global::tracer("garage_table");
|
let tracer = opentelemetry::global::tracer("garage_table");
|
||||||
let span = tracer.start(format!("{} insert", F::TABLE_NAME));
|
let span = tracer.start(format!("{} insert", F::TABLE_NAME));
|
||||||
|
@ -128,6 +139,11 @@ where
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Insert item locally
|
||||||
|
pub fn queue_insert(&self, tx: &mut db::Transaction, e: &F::E) -> db::TxResult<(), Error> {
|
||||||
|
self.data.queue_insert(tx, e)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error>
|
pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error>
|
||||||
where
|
where
|
||||||
I: IntoIterator<Item = IE> + Send + Sync,
|
I: IntoIterator<Item = IE> + Send + Sync,
|
||||||
|
@ -259,9 +275,11 @@ where
|
||||||
if not_all_same {
|
if not_all_same {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
let ent2 = ret_entry.clone();
|
let ent2 = ret_entry.clone();
|
||||||
self.system
|
tokio::spawn(async move {
|
||||||
.background
|
if let Err(e) = self2.repair_on_read(&who[..], ent2).await {
|
||||||
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
|
warn!("Error doing repair on read: {}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -358,11 +376,12 @@ where
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|k| ret.get(&k).unwrap().clone())
|
.map(|k| ret.get(&k).unwrap().clone())
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
self.system.background.spawn_cancellable(async move {
|
tokio::spawn(async move {
|
||||||
for v in to_repair {
|
for v in to_repair {
|
||||||
self2.repair_on_read(&who[..], v).await?;
|
if let Err(e) = self2.repair_on_read(&who[..], v).await {
|
||||||
|
warn!("Error doing repair on read: {}", e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,48 +0,0 @@
|
||||||
//! Job worker: a generic worker that just processes incoming
|
|
||||||
//! jobs one by one
|
|
||||||
|
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use async_trait::async_trait;
|
|
||||||
use tokio::sync::{mpsc, Mutex};
|
|
||||||
|
|
||||||
use crate::background::worker::*;
|
|
||||||
use crate::background::*;
|
|
||||||
|
|
||||||
pub(crate) struct JobWorker {
|
|
||||||
pub(crate) index: usize,
|
|
||||||
pub(crate) job_chan: Arc<Mutex<mpsc::UnboundedReceiver<(Job, bool)>>>,
|
|
||||||
pub(crate) next_job: Option<Job>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl Worker for JobWorker {
|
|
||||||
fn name(&self) -> String {
|
|
||||||
format!("Job worker #{}", self.index)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
|
||||||
match self.next_job.take() {
|
|
||||||
None => return Ok(WorkerState::Idle),
|
|
||||||
Some(job) => {
|
|
||||||
job.await?;
|
|
||||||
Ok(WorkerState::Busy)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
|
|
||||||
loop {
|
|
||||||
match self.job_chan.lock().await.recv().await {
|
|
||||||
Some((job, cancellable)) => {
|
|
||||||
if cancellable && *must_exit.borrow() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
self.next_job = Some(job);
|
|
||||||
return WorkerState::Busy;
|
|
||||||
}
|
|
||||||
None => return WorkerState::Done,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,27 +1,18 @@
|
||||||
//! Job runner for futures and async functions
|
//! Job runner for futures and async functions
|
||||||
|
|
||||||
pub mod job_worker;
|
|
||||||
pub mod worker;
|
pub mod worker;
|
||||||
|
|
||||||
use core::future::Future;
|
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::pin::Pin;
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::sync::{mpsc, watch, Mutex};
|
use tokio::sync::{mpsc, watch};
|
||||||
|
|
||||||
use crate::error::Error;
|
|
||||||
use worker::WorkerProcessor;
|
use worker::WorkerProcessor;
|
||||||
pub use worker::{Worker, WorkerState};
|
pub use worker::{Worker, WorkerState};
|
||||||
|
|
||||||
pub(crate) type JobOutput = Result<(), Error>;
|
|
||||||
pub(crate) type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
|
|
||||||
|
|
||||||
/// Job runner for futures and async functions
|
/// Job runner for futures and async functions
|
||||||
pub struct BackgroundRunner {
|
pub struct BackgroundRunner {
|
||||||
send_job: mpsc::UnboundedSender<(Job, bool)>,
|
|
||||||
send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
|
send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
|
||||||
worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
|
worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
|
||||||
}
|
}
|
||||||
|
@ -49,10 +40,7 @@ pub struct WorkerStatus {
|
||||||
|
|
||||||
impl BackgroundRunner {
|
impl BackgroundRunner {
|
||||||
/// Create a new BackgroundRunner
|
/// Create a new BackgroundRunner
|
||||||
pub fn new(
|
pub fn new(stop_signal: watch::Receiver<bool>) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
|
||||||
n_runners: usize,
|
|
||||||
stop_signal: watch::Receiver<bool>,
|
|
||||||
) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
|
|
||||||
let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
|
let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
|
||||||
|
|
||||||
let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
|
let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
|
||||||
|
@ -63,24 +51,7 @@ impl BackgroundRunner {
|
||||||
worker_processor.run().await;
|
worker_processor.run().await;
|
||||||
});
|
});
|
||||||
|
|
||||||
let (send_job, queue_out) = mpsc::unbounded_channel();
|
|
||||||
let queue_out = Arc::new(Mutex::new(queue_out));
|
|
||||||
|
|
||||||
for i in 0..n_runners {
|
|
||||||
let queue_out = queue_out.clone();
|
|
||||||
|
|
||||||
send_worker
|
|
||||||
.send(Box::new(job_worker::JobWorker {
|
|
||||||
index: i,
|
|
||||||
job_chan: queue_out.clone(),
|
|
||||||
next_job: None,
|
|
||||||
}))
|
|
||||||
.ok()
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
let bgrunner = Arc::new(Self {
|
let bgrunner = Arc::new(Self {
|
||||||
send_job,
|
|
||||||
send_worker,
|
send_worker,
|
||||||
worker_info,
|
worker_info,
|
||||||
});
|
});
|
||||||
|
@ -91,31 +62,6 @@ impl BackgroundRunner {
|
||||||
self.worker_info.lock().unwrap().clone()
|
self.worker_info.lock().unwrap().clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Spawn a task to be run in background
|
|
||||||
pub fn spawn<T>(&self, job: T)
|
|
||||||
where
|
|
||||||
T: Future<Output = JobOutput> + Send + 'static,
|
|
||||||
{
|
|
||||||
let boxed: Job = Box::pin(job);
|
|
||||||
self.send_job
|
|
||||||
.send((boxed, false))
|
|
||||||
.ok()
|
|
||||||
.expect("Could not put job in queue");
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Spawn a task to be run in background. It may get discarded before running if spawned while
|
|
||||||
/// the runner is stopping
|
|
||||||
pub fn spawn_cancellable<T>(&self, job: T)
|
|
||||||
where
|
|
||||||
T: Future<Output = JobOutput> + Send + 'static,
|
|
||||||
{
|
|
||||||
let boxed: Job = Box::pin(job);
|
|
||||||
self.send_job
|
|
||||||
.send((boxed, true))
|
|
||||||
.ok()
|
|
||||||
.expect("Could not put job in queue");
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn spawn_worker<W>(&self, worker: W)
|
pub fn spawn_worker<W>(&self, worker: W)
|
||||||
where
|
where
|
||||||
W: Worker + 'static,
|
W: Worker + 'static,
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use futures::future::*;
|
use futures::future::*;
|
||||||
|
@ -14,6 +14,10 @@ use crate::background::{WorkerInfo, WorkerStatus};
|
||||||
use crate::error::Error;
|
use crate::error::Error;
|
||||||
use crate::time::now_msec;
|
use crate::time::now_msec;
|
||||||
|
|
||||||
|
// All workers that haven't exited for this time after an exit signal was received
|
||||||
|
// will be interrupted in the middle of whatever they are doing.
|
||||||
|
const EXIT_DEADLINE: Duration = Duration::from_secs(8);
|
||||||
|
|
||||||
#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
|
#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
|
||||||
pub enum WorkerState {
|
pub enum WorkerState {
|
||||||
Busy,
|
Busy,
|
||||||
|
@ -50,10 +54,8 @@ pub trait Worker: Send {
|
||||||
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
|
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
|
||||||
|
|
||||||
/// Wait for work: await for some task to become available. This future can be interrupted in
|
/// Wait for work: await for some task to become available. This future can be interrupted in
|
||||||
/// the middle for any reason. This future doesn't have to await on must_exit.changed(), we
|
/// the middle for any reason, for example if an interrupt signal was received.
|
||||||
/// are doing it for you. Therefore it only receives a read reference to must_exit which allows
|
async fn wait_for_work(&mut self) -> WorkerState;
|
||||||
/// it to check if we are exiting.
|
|
||||||
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) struct WorkerProcessor {
|
pub(crate) struct WorkerProcessor {
|
||||||
|
@ -93,11 +95,9 @@ impl WorkerProcessor {
|
||||||
let task_id = next_task_id;
|
let task_id = next_task_id;
|
||||||
next_task_id += 1;
|
next_task_id += 1;
|
||||||
let stop_signal = self.stop_signal.clone();
|
let stop_signal = self.stop_signal.clone();
|
||||||
let stop_signal_worker = self.stop_signal.clone();
|
|
||||||
let mut worker = WorkerHandler {
|
let mut worker = WorkerHandler {
|
||||||
task_id,
|
task_id,
|
||||||
stop_signal,
|
stop_signal,
|
||||||
stop_signal_worker,
|
|
||||||
worker: new_worker,
|
worker: new_worker,
|
||||||
state: WorkerState::Busy,
|
state: WorkerState::Busy,
|
||||||
errors: 0,
|
errors: 0,
|
||||||
|
@ -153,26 +153,14 @@ impl WorkerProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
// We are exiting, drain everything
|
// We are exiting, drain everything
|
||||||
let drain_half_time = Instant::now() + Duration::from_secs(5);
|
|
||||||
let drain_everything = async move {
|
let drain_everything = async move {
|
||||||
while let Some(mut worker) = workers.next().await {
|
while let Some(worker) = workers.next().await {
|
||||||
if worker.state == WorkerState::Done {
|
info!(
|
||||||
info!(
|
"Worker {} (TID {}) exited (last state: {:?})",
|
||||||
"Worker {} (TID {}) exited",
|
worker.worker.name(),
|
||||||
worker.worker.name(),
|
worker.task_id,
|
||||||
worker.task_id
|
worker.state
|
||||||
);
|
);
|
||||||
} else if Instant::now() > drain_half_time {
|
|
||||||
warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state);
|
|
||||||
} else {
|
|
||||||
workers.push(
|
|
||||||
async move {
|
|
||||||
worker.step().await;
|
|
||||||
worker
|
|
||||||
}
|
|
||||||
.boxed(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -180,7 +168,7 @@ impl WorkerProcessor {
|
||||||
_ = drain_everything => {
|
_ = drain_everything => {
|
||||||
info!("All workers exited peacefully \\o/");
|
info!("All workers exited peacefully \\o/");
|
||||||
}
|
}
|
||||||
_ = tokio::time::sleep(Duration::from_secs(9)) => {
|
_ = tokio::time::sleep(EXIT_DEADLINE) => {
|
||||||
error!("Some workers could not exit in time, we are cancelling some things in the middle");
|
error!("Some workers could not exit in time, we are cancelling some things in the middle");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -190,7 +178,6 @@ impl WorkerProcessor {
|
||||||
struct WorkerHandler {
|
struct WorkerHandler {
|
||||||
task_id: usize,
|
task_id: usize,
|
||||||
stop_signal: watch::Receiver<bool>,
|
stop_signal: watch::Receiver<bool>,
|
||||||
stop_signal_worker: watch::Receiver<bool>,
|
|
||||||
worker: Box<dyn Worker>,
|
worker: Box<dyn Worker>,
|
||||||
state: WorkerState,
|
state: WorkerState,
|
||||||
errors: usize,
|
errors: usize,
|
||||||
|
@ -225,33 +212,19 @@ impl WorkerHandler {
|
||||||
},
|
},
|
||||||
WorkerState::Throttled(delay) => {
|
WorkerState::Throttled(delay) => {
|
||||||
// Sleep for given delay and go back to busy state
|
// Sleep for given delay and go back to busy state
|
||||||
if !*self.stop_signal.borrow() {
|
select! {
|
||||||
select! {
|
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => {
|
||||||
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (),
|
self.state = WorkerState::Busy;
|
||||||
_ = self.stop_signal.changed() => (),
|
|
||||||
}
|
}
|
||||||
|
_ = self.stop_signal.changed() => (),
|
||||||
}
|
}
|
||||||
self.state = WorkerState::Busy;
|
|
||||||
}
|
}
|
||||||
WorkerState::Idle => {
|
WorkerState::Idle => {
|
||||||
if *self.stop_signal.borrow() {
|
select! {
|
||||||
select! {
|
new_st = self.worker.wait_for_work() => {
|
||||||
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
|
self.state = new_st;
|
||||||
self.state = new_st;
|
|
||||||
}
|
|
||||||
_ = tokio::time::sleep(Duration::from_secs(1)) => {
|
|
||||||
// stay in Idle state
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
select! {
|
|
||||||
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
|
|
||||||
self.state = new_st;
|
|
||||||
}
|
|
||||||
_ = self.stop_signal.changed() => {
|
|
||||||
// stay in Idle state
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
_ = self.stop_signal.changed() => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
WorkerState::Done => unreachable!(),
|
WorkerState::Done => unreachable!(),
|
||||||
|
|
Loading…
Reference in a new issue