Several changes in table_sync:
- separate path for the case of offloading a partition we don't store - use std::sync::Mutex instead of tokio::Mutex, make fewer functions async
This commit is contained in:
parent
40763fd749
commit
55156cca9d
2 changed files with 191 additions and 119 deletions
|
@ -435,7 +435,7 @@ where
|
||||||
let syncer = self.syncer.load_full().unwrap();
|
let syncer = self.syncer.load_full().unwrap();
|
||||||
|
|
||||||
debug!("({}) Deleting range {:?} - {:?}", self.name, begin, end);
|
debug!("({}) Deleting range {:?} - {:?}", self.name, begin, end);
|
||||||
let mut count = 0;
|
let mut count: usize = 0;
|
||||||
while let Some((key, _value)) = self.store.get_lt(end.as_slice())? {
|
while let Some((key, _value)) = self.store.get_lt(end.as_slice())? {
|
||||||
if key.as_ref() < begin.as_slice() {
|
if key.as_ref() < begin.as_slice() {
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -1,15 +1,14 @@
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use std::collections::{BTreeMap, VecDeque};
|
use std::collections::{BTreeMap, VecDeque};
|
||||||
use std::sync::Arc;
|
use std::sync::{Arc, Mutex};
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use futures::future::BoxFuture;
|
use futures::future::join_all;
|
||||||
use futures::{pin_mut, select};
|
use futures::{pin_mut, select};
|
||||||
use futures_util::future::*;
|
use futures_util::future::*;
|
||||||
use futures_util::stream::*;
|
use futures_util::stream::*;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_bytes::ByteBuf;
|
use serde_bytes::ByteBuf;
|
||||||
use tokio::sync::Mutex;
|
|
||||||
use tokio::sync::{mpsc, watch};
|
use tokio::sync::{mpsc, watch};
|
||||||
|
|
||||||
use garage_rpc::ring::Ring;
|
use garage_rpc::ring::Ring;
|
||||||
|
@ -33,7 +32,7 @@ pub struct TableSyncer<F: TableSchema, R: TableReplication> {
|
||||||
pub enum SyncRPC {
|
pub enum SyncRPC {
|
||||||
GetRootChecksumRange(Hash, Hash),
|
GetRootChecksumRange(Hash, Hash),
|
||||||
RootChecksumRange(SyncRange),
|
RootChecksumRange(SyncRange),
|
||||||
Checksums(Vec<RangeChecksum>, bool),
|
Checksums(Vec<RangeChecksum>),
|
||||||
Difference(Vec<SyncRange>, Vec<Arc<ByteBuf>>),
|
Difference(Vec<SyncRange>, Vec<Arc<ByteBuf>>),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,8 +42,11 @@ pub struct SyncTodo {
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
struct TodoPartition {
|
struct TodoPartition {
|
||||||
|
// Partition consists of hashes between begin (included) and end (excluded)
|
||||||
begin: Hash,
|
begin: Hash,
|
||||||
end: Hash,
|
end: Hash,
|
||||||
|
|
||||||
|
// Are we a node that stores this partition or not?
|
||||||
retain: bool,
|
retain: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,7 +163,7 @@ where
|
||||||
new_ring_r = s_ring_recv => {
|
new_ring_r = s_ring_recv => {
|
||||||
if let Some(new_ring) = new_ring_r {
|
if let Some(new_ring) = new_ring_r {
|
||||||
debug!("({}) Adding ring difference to syncer todo list", self.table.name);
|
debug!("({}) Adding ring difference to syncer todo list", self.table.name);
|
||||||
self.todo.lock().await.add_ring_difference(&self.table, &prev_ring, &new_ring);
|
self.todo.lock().unwrap().add_ring_difference(&self.table, &prev_ring, &new_ring);
|
||||||
prev_ring = new_ring;
|
prev_ring = new_ring;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -194,7 +196,7 @@ where
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn add_full_scan(&self) {
|
pub async fn add_full_scan(&self) {
|
||||||
self.todo.lock().await.add_full_scan(&self.table);
|
self.todo.lock().unwrap().add_full_scan(&self.table);
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn syncer_task(
|
async fn syncer_task(
|
||||||
|
@ -203,7 +205,8 @@ where
|
||||||
busy_tx: mpsc::UnboundedSender<bool>,
|
busy_tx: mpsc::UnboundedSender<bool>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
if let Some(partition) = self.todo.lock().await.pop_task() {
|
let task = self.todo.lock().unwrap().pop_task();
|
||||||
|
if let Some(partition) = task {
|
||||||
busy_tx.send(true)?;
|
busy_tx.send(true)?;
|
||||||
let res = self
|
let res = self
|
||||||
.clone()
|
.clone()
|
||||||
|
@ -228,6 +231,7 @@ where
|
||||||
partition: &TodoPartition,
|
partition: &TodoPartition,
|
||||||
must_exit: &mut watch::Receiver<bool>,
|
must_exit: &mut watch::Receiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
|
if partition.retain {
|
||||||
let my_id = self.table.system.id;
|
let my_id = self.table.system.id;
|
||||||
let nodes = self
|
let nodes = self
|
||||||
.table
|
.table
|
||||||
|
@ -241,9 +245,7 @@ where
|
||||||
"({}) Preparing to sync {:?} with {:?}...",
|
"({}) Preparing to sync {:?} with {:?}...",
|
||||||
self.table.name, partition, nodes
|
self.table.name, partition, nodes
|
||||||
);
|
);
|
||||||
let root_cks = self
|
let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?;
|
||||||
.root_checksum(&partition.begin, &partition.end, must_exit)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let mut sync_futures = nodes
|
let mut sync_futures = nodes
|
||||||
.iter()
|
.iter()
|
||||||
|
@ -252,7 +254,6 @@ where
|
||||||
partition.clone(),
|
partition.clone(),
|
||||||
root_cks.clone(),
|
root_cks.clone(),
|
||||||
*node,
|
*node,
|
||||||
partition.retain,
|
|
||||||
must_exit.clone(),
|
must_exit.clone(),
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
@ -271,33 +272,111 @@ where
|
||||||
nodes
|
nodes
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
if !partition.retain {
|
self.offload_partition(&partition.begin, &partition.end, must_exit)
|
||||||
self.table
|
|
||||||
.delete_range(&partition.begin, &partition.end)
|
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn root_checksum(
|
// Offload partition: this partition is not something we are storing,
|
||||||
|
// so send it out to all other nodes that store it and delete items locally.
|
||||||
|
// We don't bother checking if the remote nodes already have the items,
|
||||||
|
// we just batch-send everything. Offloading isn't supposed to happen very often.
|
||||||
|
// If any of the nodes that are supposed to store the items is unable to
|
||||||
|
// save them, we interrupt the process.
|
||||||
|
async fn offload_partition(
|
||||||
|
self: &Arc<Self>,
|
||||||
|
begin: &Hash,
|
||||||
|
end: &Hash,
|
||||||
|
must_exit: &mut watch::Receiver<bool>,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let mut counter: usize = 0;
|
||||||
|
|
||||||
|
while !*must_exit.borrow() {
|
||||||
|
let mut items = Vec::new();
|
||||||
|
|
||||||
|
for item in self.table.store.range(begin.to_vec()..end.to_vec()) {
|
||||||
|
let (key, value) = item?;
|
||||||
|
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
|
||||||
|
|
||||||
|
if items.len() >= 1024 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if items.len() > 0 {
|
||||||
|
let nodes = self
|
||||||
|
.table
|
||||||
|
.replication
|
||||||
|
.write_nodes(&begin, &self.table.system)
|
||||||
|
.into_iter()
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
if nodes.contains(&self.table.system.id) {
|
||||||
|
warn!("Interrupting offload as partitions seem to have changed");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
counter += 1;
|
||||||
|
debug!("Offloading items from {:?}..{:?} ({})", begin, end, counter);
|
||||||
|
self.offload_items(&items, &nodes[..]).await?;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn offload_items(
|
||||||
|
self: &Arc<Self>,
|
||||||
|
items: &Vec<(Vec<u8>, Arc<ByteBuf>)>,
|
||||||
|
nodes: &[UUID],
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();
|
||||||
|
let update_msg = Arc::new(TableRPC::<F>::Update(values));
|
||||||
|
|
||||||
|
for res in join_all(nodes.iter().map(|to| {
|
||||||
|
self.table
|
||||||
|
.rpc_client
|
||||||
|
.call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT)
|
||||||
|
}))
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
res?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// All remote nodes have written those items, now we can delete them locally
|
||||||
|
for (k, v) in items.iter() {
|
||||||
|
self.table.store.transaction(|tx_db| {
|
||||||
|
if let Some(curv) = tx_db.get(k)? {
|
||||||
|
if curv == &v[..] {
|
||||||
|
tx_db.remove(&k[..])?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn root_checksum(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
begin: &Hash,
|
begin: &Hash,
|
||||||
end: &Hash,
|
end: &Hash,
|
||||||
must_exit: &mut watch::Receiver<bool>,
|
must_exit: &mut watch::Receiver<bool>,
|
||||||
) -> Result<RangeChecksum, Error> {
|
) -> Result<RangeChecksum, Error> {
|
||||||
for i in 1..MAX_DEPTH {
|
for i in 1..MAX_DEPTH {
|
||||||
let rc = self
|
let rc = self.range_checksum(
|
||||||
.range_checksum(
|
|
||||||
&SyncRange {
|
&SyncRange {
|
||||||
begin: begin.to_vec(),
|
begin: begin.to_vec(),
|
||||||
end: end.to_vec(),
|
end: end.to_vec(),
|
||||||
level: i,
|
level: i,
|
||||||
},
|
},
|
||||||
must_exit,
|
must_exit,
|
||||||
)
|
)?;
|
||||||
.await?;
|
|
||||||
if rc.found_limit.is_none() {
|
if rc.found_limit.is_none() {
|
||||||
return Ok(rc);
|
return Ok(rc);
|
||||||
}
|
}
|
||||||
|
@ -307,7 +386,7 @@ where
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn range_checksum(
|
fn range_checksum(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
range: &SyncRange,
|
range: &SyncRange,
|
||||||
must_exit: &mut watch::Receiver<bool>,
|
must_exit: &mut watch::Receiver<bool>,
|
||||||
|
@ -357,9 +436,7 @@ where
|
||||||
};
|
};
|
||||||
let mut time = Instant::now();
|
let mut time = Instant::now();
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
let sub_ck = self
|
let sub_ck = self.range_checksum_cached_hash(&sub_range, must_exit)?;
|
||||||
.range_checksum_cached_hash(&sub_range, must_exit)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if let Some(hash) = sub_ck.hash {
|
if let Some(hash) = sub_ck.hash {
|
||||||
children.push((sub_range.clone(), hash));
|
children.push((sub_range.clone(), hash));
|
||||||
|
@ -397,22 +474,22 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn range_checksum_cached_hash<'a>(
|
fn range_checksum_cached_hash(
|
||||||
self: &'a Arc<Self>,
|
self: &Arc<Self>,
|
||||||
range: &'a SyncRange,
|
range: &SyncRange,
|
||||||
must_exit: &'a mut watch::Receiver<bool>,
|
must_exit: &mut watch::Receiver<bool>,
|
||||||
) -> BoxFuture<'a, Result<RangeChecksumCache, Error>> {
|
) -> Result<RangeChecksumCache, Error> {
|
||||||
async move {
|
{
|
||||||
let mut cache = self.cache[range.level].lock().await;
|
let mut cache = self.cache[range.level].lock().unwrap();
|
||||||
if let Some(v) = cache.get(&range) {
|
if let Some(v) = cache.get(&range) {
|
||||||
if Instant::now() - v.time < CHECKSUM_CACHE_TIMEOUT {
|
if Instant::now() - v.time < CHECKSUM_CACHE_TIMEOUT {
|
||||||
return Ok(v.clone());
|
return Ok(v.clone());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cache.remove(&range);
|
cache.remove(&range);
|
||||||
drop(cache);
|
}
|
||||||
|
|
||||||
let v = self.range_checksum(&range, must_exit).await?;
|
let v = self.range_checksum(&range, must_exit)?;
|
||||||
trace!(
|
trace!(
|
||||||
"({}) New checksum calculated for {}-{}/{}, {} children",
|
"({}) New checksum calculated for {}-{}/{}, {} children",
|
||||||
self.table.name,
|
self.table.name,
|
||||||
|
@ -436,19 +513,16 @@ where
|
||||||
time: v.time,
|
time: v.time,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut cache = self.cache[range.level].lock().await;
|
let mut cache = self.cache[range.level].lock().unwrap();
|
||||||
cache.insert(range.clone(), cache_entry.clone());
|
cache.insert(range.clone(), cache_entry.clone());
|
||||||
Ok(cache_entry)
|
Ok(cache_entry)
|
||||||
}
|
}
|
||||||
.boxed()
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn do_sync_with(
|
async fn do_sync_with(
|
||||||
self: Arc<Self>,
|
self: Arc<Self>,
|
||||||
partition: TodoPartition,
|
partition: TodoPartition,
|
||||||
root_ck: RangeChecksum,
|
root_ck: RangeChecksum,
|
||||||
who: UUID,
|
who: UUID,
|
||||||
retain: bool,
|
|
||||||
mut must_exit: watch::Receiver<bool>,
|
mut must_exit: watch::Receiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let mut todo = VecDeque::new();
|
let mut todo = VecDeque::new();
|
||||||
|
@ -468,7 +542,7 @@ where
|
||||||
.await?;
|
.await?;
|
||||||
if let TableRPC::<F>::SyncRPC(SyncRPC::RootChecksumRange(range)) = root_cks_resp {
|
if let TableRPC::<F>::SyncRPC(SyncRPC::RootChecksumRange(range)) = root_cks_resp {
|
||||||
if range.level > root_ck.bounds.level {
|
if range.level > root_ck.bounds.level {
|
||||||
let their_root_range_ck = self.range_checksum(&range, &mut must_exit).await?;
|
let their_root_range_ck = self.range_checksum(&range, &mut must_exit)?;
|
||||||
todo.push_back(their_root_range_ck);
|
todo.push_back(their_root_range_ck);
|
||||||
} else {
|
} else {
|
||||||
todo.push_back(root_ck);
|
todo.push_back(root_ck);
|
||||||
|
@ -498,7 +572,7 @@ where
|
||||||
.rpc_client
|
.rpc_client
|
||||||
.call(
|
.call(
|
||||||
who,
|
who,
|
||||||
TableRPC::<F>::SyncRPC(SyncRPC::Checksums(step, retain)),
|
TableRPC::<F>::SyncRPC(SyncRPC::Checksums(step)),
|
||||||
TABLE_SYNC_RPC_TIMEOUT,
|
TABLE_SYNC_RPC_TIMEOUT,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
@ -519,11 +593,11 @@ where
|
||||||
if differing.level == 0 {
|
if differing.level == 0 {
|
||||||
items_to_send.push(differing.begin);
|
items_to_send.push(differing.begin);
|
||||||
} else {
|
} else {
|
||||||
let checksum = self.range_checksum(&differing, &mut must_exit).await?;
|
let checksum = self.range_checksum(&differing, &mut must_exit)?;
|
||||||
todo.push_back(checksum);
|
todo.push_back(checksum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if retain && diff_items.len() > 0 {
|
if diff_items.len() > 0 {
|
||||||
self.table.handle_update(&diff_items[..]).await?;
|
self.table.handle_update(&diff_items[..]).await?;
|
||||||
}
|
}
|
||||||
if items_to_send.len() > 0 {
|
if items_to_send.len() > 0 {
|
||||||
|
@ -575,11 +649,11 @@ where
|
||||||
) -> Result<SyncRPC, Error> {
|
) -> Result<SyncRPC, Error> {
|
||||||
match message {
|
match message {
|
||||||
SyncRPC::GetRootChecksumRange(begin, end) => {
|
SyncRPC::GetRootChecksumRange(begin, end) => {
|
||||||
let root_cks = self.root_checksum(&begin, &end, &mut must_exit).await?;
|
let root_cks = self.root_checksum(&begin, &end, &mut must_exit)?;
|
||||||
Ok(SyncRPC::RootChecksumRange(root_cks.bounds))
|
Ok(SyncRPC::RootChecksumRange(root_cks.bounds))
|
||||||
}
|
}
|
||||||
SyncRPC::Checksums(checksums, retain) => {
|
SyncRPC::Checksums(checksums) => {
|
||||||
self.handle_checksums_rpc(&checksums[..], *retain, &mut must_exit)
|
self.handle_checksums_rpc(&checksums[..], &mut must_exit)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
_ => Err(Error::Message(format!("Unexpected sync RPC"))),
|
_ => Err(Error::Message(format!("Unexpected sync RPC"))),
|
||||||
|
@ -589,14 +663,13 @@ where
|
||||||
async fn handle_checksums_rpc(
|
async fn handle_checksums_rpc(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
checksums: &[RangeChecksum],
|
checksums: &[RangeChecksum],
|
||||||
retain: bool,
|
|
||||||
must_exit: &mut watch::Receiver<bool>,
|
must_exit: &mut watch::Receiver<bool>,
|
||||||
) -> Result<SyncRPC, Error> {
|
) -> Result<SyncRPC, Error> {
|
||||||
let mut ret_ranges = vec![];
|
let mut ret_ranges = vec![];
|
||||||
let mut ret_items = vec![];
|
let mut ret_items = vec![];
|
||||||
|
|
||||||
for their_ckr in checksums.iter() {
|
for their_ckr in checksums.iter() {
|
||||||
let our_ckr = self.range_checksum(&their_ckr.bounds, must_exit).await?;
|
let our_ckr = self.range_checksum(&their_ckr.bounds, must_exit)?;
|
||||||
for (their_range, their_hash) in their_ckr.children.iter() {
|
for (their_range, their_hash) in their_ckr.children.iter() {
|
||||||
let differs = match our_ckr
|
let differs = match our_ckr
|
||||||
.children
|
.children
|
||||||
|
@ -604,9 +677,8 @@ where
|
||||||
{
|
{
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
if their_range.level >= 1 {
|
if their_range.level >= 1 {
|
||||||
let cached_hash = self
|
let cached_hash =
|
||||||
.range_checksum_cached_hash(&their_range, must_exit)
|
self.range_checksum_cached_hash(&their_range, must_exit)?;
|
||||||
.await?;
|
|
||||||
cached_hash.hash.map(|h| h != *their_hash).unwrap_or(true)
|
cached_hash.hash.map(|h| h != *their_hash).unwrap_or(true)
|
||||||
} else {
|
} else {
|
||||||
true
|
true
|
||||||
|
@ -616,7 +688,7 @@ where
|
||||||
};
|
};
|
||||||
if differs {
|
if differs {
|
||||||
ret_ranges.push(their_range.clone());
|
ret_ranges.push(their_range.clone());
|
||||||
if retain && their_range.level == 0 {
|
if their_range.level == 0 {
|
||||||
if let Some(item_bytes) =
|
if let Some(item_bytes) =
|
||||||
self.table.store.get(their_range.begin.as_slice())?
|
self.table.store.get(their_range.begin.as_slice())?
|
||||||
{
|
{
|
||||||
|
@ -640,7 +712,7 @@ where
|
||||||
if our_range.level > 0 {
|
if our_range.level > 0 {
|
||||||
ret_ranges.push(our_range.clone());
|
ret_ranges.push(our_range.clone());
|
||||||
}
|
}
|
||||||
if retain && our_range.level == 0 {
|
if our_range.level == 0 {
|
||||||
if let Some(item_bytes) =
|
if let Some(item_bytes) =
|
||||||
self.table.store.get(our_range.begin.as_slice())?
|
self.table.store.get(our_range.begin.as_slice())?
|
||||||
{
|
{
|
||||||
|
@ -673,7 +745,7 @@ where
|
||||||
end: vec![],
|
end: vec![],
|
||||||
level: i,
|
level: i,
|
||||||
};
|
};
|
||||||
let mut cache = self.cache[i].lock().await;
|
let mut cache = self.cache[i].lock().unwrap();
|
||||||
if let Some(cache_entry) = cache.range(..=needle).rev().next() {
|
if let Some(cache_entry) = cache.range(..=needle).rev().next() {
|
||||||
if cache_entry.0.begin <= item_key && cache_entry.0.end > item_key {
|
if cache_entry.0.begin <= item_key && cache_entry.0.end > item_key {
|
||||||
let index = cache_entry.0.clone();
|
let index = cache_entry.0.clone();
|
||||||
|
|
Loading…
Reference in a new issue