WIP big refactoring
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Alex 2021-03-11 16:54:15 +01:00
parent 8d63738cb0
commit 94f3d28774
16 changed files with 387 additions and 325 deletions

3
Cargo.lock generated
View file

@ -1,5 +1,7 @@
# This file is automatically @generated by Cargo. # This file is automatically @generated by Cargo.
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.15" version = "0.7.15"
@ -583,7 +585,6 @@ dependencies = [
name = "garage_table" name = "garage_table"
version = "0.1.1" version = "0.1.1"
dependencies = [ dependencies = [
"arc-swap",
"async-trait", "async-trait",
"bytes 0.4.12", "bytes 0.4.12",
"futures", "futures",

View file

@ -28,38 +28,23 @@ impl Repair {
self.garage self.garage
.bucket_table .bucket_table
.syncer .syncer
.load_full() .add_full_scan();
.unwrap()
.add_full_scan()
.await;
self.garage self.garage
.object_table .object_table
.syncer .syncer
.load_full() .add_full_scan();
.unwrap()
.add_full_scan()
.await;
self.garage self.garage
.version_table .version_table
.syncer .syncer
.load_full() .add_full_scan();
.unwrap()
.add_full_scan()
.await;
self.garage self.garage
.block_ref_table .block_ref_table
.syncer .syncer
.load_full() .add_full_scan();
.unwrap()
.add_full_scan()
.await;
self.garage self.garage
.key_table .key_table
.syncer .syncer
.load_full() .add_full_scan();
.unwrap()
.add_full_scan()
.await;
} }
// TODO: wait for full sync to finish before proceeding to the rest? // TODO: wait for full sync to finish before proceeding to the rest?
@ -93,7 +78,7 @@ impl Repair {
async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> { async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
let mut pos = vec![]; let mut pos = vec![];
while let Some((item_key, item_bytes)) = self.garage.version_table.store.get_gt(&pos)? { while let Some((item_key, item_bytes)) = self.garage.version_table.data.store.get_gt(&pos)? {
pos = item_key.to_vec(); pos = item_key.to_vec();
let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?; let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?;
@ -141,7 +126,7 @@ impl Repair {
async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> { async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
let mut pos = vec![]; let mut pos = vec![];
while let Some((item_key, item_bytes)) = self.garage.block_ref_table.store.get_gt(&pos)? { while let Some((item_key, item_bytes)) = self.garage.block_ref_table.data.store.get_gt(&pos)? {
pos = item_key.to_vec(); pos = item_key.to_vec();
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?; let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?;

View file

@ -19,8 +19,7 @@ use garage_rpc::membership::System;
use garage_rpc::rpc_client::*; use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*; use garage_rpc::rpc_server::*;
use garage_table::table_sharded::TableShardedReplication; use garage_table::replication::{sharded::TableShardedReplication, TableReplication};
use garage_table::TableReplication;
use crate::block_ref_table::*; use crate::block_ref_table::*;
@ -412,7 +411,7 @@ impl BlockManager {
let garage = self.garage.load_full().unwrap(); let garage = self.garage.load_full().unwrap();
let mut last_hash = None; let mut last_hash = None;
let mut i = 0usize; let mut i = 0usize;
for entry in garage.block_ref_table.store.iter() { for entry in garage.block_ref_table.data.store.iter() {
let (_k, v_bytes) = entry?; let (_k, v_bytes) = entry?;
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?; let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?;
if Some(&block_ref.block) == last_hash.as_ref() { if Some(&block_ref.block) == last_hash.as_ref() {

View file

@ -7,8 +7,8 @@ use garage_rpc::membership::System;
use garage_rpc::rpc_client::RpcHttpClient; use garage_rpc::rpc_client::RpcHttpClient;
use garage_rpc::rpc_server::RpcServer; use garage_rpc::rpc_server::RpcServer;
use garage_table::table_fullcopy::*; use garage_table::replication::sharded::*;
use garage_table::table_sharded::*; use garage_table::replication::fullcopy::*;
use garage_table::*; use garage_table::*;
use crate::block::*; use crate::block::*;

View file

@ -6,7 +6,7 @@ use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::crdt::*; use garage_table::crdt::*;
use garage_table::table_sharded::*; use garage_table::replication::sharded::*;
use garage_table::*; use garage_table::*;
use crate::version_table::*; use crate::version_table::*;

View file

@ -5,7 +5,7 @@ use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_table::crdt::*; use garage_table::crdt::*;
use garage_table::table_sharded::*; use garage_table::replication::sharded::*;
use garage_table::*; use garage_table::*;
use crate::block_ref_table::*; use crate::block_ref_table::*;

View file

@ -19,7 +19,6 @@ garage_rpc = { version = "0.1.1", path = "../rpc" }
bytes = "0.4" bytes = "0.4"
rand = "0.7" rand = "0.7"
hex = "0.3" hex = "0.3"
arc-swap = "0.4"
log = "0.4" log = "0.4"
hexdump = "0.1" hexdump = "0.1"

189
src/table/data.rs Normal file
View file

@ -0,0 +1,189 @@
use std::sync::Arc;

use log::{debug, warn};

use serde_bytes::ByteBuf;
use sled::Transactional;

use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_util::error::*;

use crate::crdt::CRDT;
use crate::merkle::*;
use crate::schema::*;
/// Local storage for one table: the sled tree holding the serialized
/// entries, plus the pieces needed to keep derived state up to date.
pub struct TableData<F: TableSchema> {
	/// Table name; also used as the prefix of the sled tree names.
	pub name: String,
	/// Schema instance; its `updated()` hook is called after every mutation.
	pub instance: F,
	/// Sled tree mapping `tree_key(partition, sort)` to serialized entries.
	pub store: sled::Tree,
	/// Background worker keeping the Merkle tree consistent with `store`.
	pub(crate) merkle_updater: Arc<MerkleUpdater>,
}
impl<F> TableData<F> where F: TableSchema {
	/// Open (or create) the sled trees backing this table and launch the
	/// Merkle updater background worker. Panics if a tree cannot be opened,
	/// since the table is unusable without its storage.
	pub fn new(
		name: String,
		instance: F,
		db: &sled::Db,
		background: Arc<BackgroundRunner>,
	) -> Arc<Self> {
		// Main tree holding the table entries themselves.
		let store = db
			.open_tree(&format!("{}:table", name))
			.expect("Unable to open DB tree");

		// Queue of keys whose Merkle nodes must be recomputed.
		let merkle_todo_store = db
			.open_tree(&format!("{}:merkle_todo", name))
			.expect("Unable to open DB Merkle TODO tree");
		// Tree storing the Merkle tree nodes.
		let merkle_tree_store = db
			.open_tree(&format!("{}:merkle_tree", name))
			.expect("Unable to open DB Merkle tree tree");

		// `launch` both constructs the updater and spawns its worker loop.
		let merkle_updater = MerkleUpdater::launch(
			name.clone(),
			background,
			merkle_todo_store,
			merkle_tree_store,
		);

		Arc::new(Self{
			name,
			instance,
			store,
			merkle_updater,
		})
	}

	// Read functions

	/// Read a single entry by partition key `p` and sort key `s`, returning
	/// its raw serialized bytes if present (no decoding is attempted here).
	pub fn read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
		let tree_key = self.tree_key(p, s);
		if let Some(bytes) = self.store.get(&tree_key)? {
			Ok(Some(ByteBuf::from(bytes.to_vec())))
		} else {
			Ok(None)
		}
	}

	/// Scan up to `limit` serialized entries of partition `p`, starting at
	/// sort key `s` (or at the beginning of the partition when `s` is None),
	/// keeping only those matching `filter` when one is given.
	///
	/// Entries must be decoded to apply the filter, but the raw bytes are
	/// what is returned.
	pub fn read_range(
		&self,
		p: &F::P,
		s: &Option<F::S>,
		filter: &Option<F::Filter>,
		limit: usize,
	) -> Result<Vec<Arc<ByteBuf>>, Error> {
		let partition_hash = p.hash();
		// Tree keys are partition_hash ++ sort_key (see `tree_key`), so the
		// partition's entries form a contiguous key range.
		let first_key = match s {
			None => partition_hash.to_vec(),
			Some(sk) => self.tree_key(p, sk),
		};
		let mut ret = vec![];
		for item in self.store.range(first_key..) {
			let (key, value) = item?;
			// The first 32 bytes of the key are the partition hash; once
			// they differ we have left the requested partition.
			if &key[..32] != partition_hash.as_slice() {
				break;
			}
			let keep = match filter {
				None => true,
				Some(f) => {
					let entry = self.decode_entry(value.as_ref())?;
					F::matches_filter(&entry, f)
				}
			};
			if keep {
				ret.push(Arc::new(ByteBuf::from(value.as_ref())));
			}
			if ret.len() >= limit {
				break;
			}
		}
		Ok(ret)
	}

	// Mutation functions

	/// Apply a batch of serialized updates, one at a time.
	/// Stops at the first entry that fails to apply.
	pub(crate) fn update_many(&self, entries: &[Arc<ByteBuf>]) -> Result<(), Error> {
		for update_bytes in entries.iter() {
			self.update_entry(update_bytes.as_slice())?;
		}
		Ok(())
	}

	/// Merge one serialized update into the store.
	///
	/// The update is CRDT-merged with the existing entry (if any); the store
	/// and the Merkle todo queue are modified atomically in a single sled
	/// transaction so the Merkle tree can never miss a change. The schema's
	/// `updated()` hook is called afterwards, outside the transaction,
	/// only when the stored value actually changed.
	pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
		let update = self.decode_entry(update_bytes)?;
		let tree_key = self.tree_key(update.partition_key(), update.sort_key());

		let changed = (&self.store, &self.merkle_updater.todo).transaction(|(db, mkl_todo)| {
			let (old_entry, new_entry) = match db.get(&tree_key)? {
				Some(prev_bytes) => {
					let old_entry = self
						.decode_entry(&prev_bytes)
						.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
					// CRDT merge: commutative, so concurrent updates converge.
					let mut new_entry = old_entry.clone();
					new_entry.merge(&update);
					(Some(old_entry), new_entry)
				}
				None => (None, update.clone()),
			};

			if Some(&new_entry) != old_entry.as_ref() {
				let new_bytes = rmp_to_vec_all_named(&new_entry)
					.map_err(Error::RMPEncode)
					.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
				// Record the hash of the new value so the Merkle updater
				// recomputes the corresponding tree nodes.
				mkl_todo.insert(tree_key.clone(), blake2sum(&new_bytes[..]).to_vec())?;
				db.insert(tree_key.clone(), new_bytes)?;
				Ok(Some((old_entry, new_entry)))
			} else {
				// No-op update: don't touch the store or the Merkle queue.
				Ok(None)
			}
		})?;

		if let Some((old_entry, new_entry)) = changed {
			self.instance.updated(old_entry, Some(new_entry));
			//self.syncer.load_full().unwrap().invalidate(&tree_key[..]);
		}

		Ok(())
	}

	/// Delete the entry at key `k` only if its current stored bytes are
	/// exactly `v` (compare-and-delete). An empty value is queued in the
	/// Merkle todo tree to signal the deletion to the Merkle updater.
	/// Returns whether a deletion took place.
	pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
		let removed = (&self.store, &self.merkle_updater.todo).transaction(|(txn, mkl_todo)| {
			if let Some(cur_v) = txn.get(k)? {
				if cur_v == v {
					txn.remove(k)?;
					mkl_todo.insert(k, vec![])?;
					return Ok(true);
				}
			}
			Ok(false)
		})?;

		if removed {
			let old_entry = self.decode_entry(v)?;
			self.instance.updated(Some(old_entry), None);
			//self.syncer.load_full().unwrap().invalidate(k);
		}
		Ok(removed)
	}

	/// Build the sled key for an entry: partition hash followed by the
	/// entry's sort key bytes.
	pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
		let mut ret = p.hash().to_vec();
		ret.extend(s.sort_key());
		ret
	}

	/// Deserialize an entry, falling back to the schema's migration hook
	/// for bytes written by an older format. Logs and returns the original
	/// decode error when neither succeeds.
	pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
		match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
			Ok(x) => Ok(x),
			Err(e) => match F::try_migrate(bytes) {
				Some(x) => Ok(x),
				None => {
					warn!("Unable to decode entry of {}: {}", self.name, e);
					// NOTE(review): `debug!` is used here but this file only
					// imports `log::warn` — confirm the log macros are in
					// scope (e.g. via #[macro_use] in the crate root).
					for line in hexdump::hexdump_iter(bytes) {
						debug!("{}", line);
					}
					Err(e.into())
				}
			},
		}
	}
}

View file

@ -8,9 +8,9 @@ pub mod schema;
pub mod util; pub mod util;
pub mod merkle; pub mod merkle;
pub mod replication;
pub mod data;
pub mod table; pub mod table;
pub mod table_fullcopy;
pub mod table_sharded;
pub mod table_sync; pub mod table_sync;
pub use schema::*; pub use schema::*;

View file

@ -61,7 +61,7 @@ pub enum MerkleNode {
} }
impl MerkleUpdater { impl MerkleUpdater {
pub(crate) fn new( pub(crate) fn launch(
table_name: String, table_name: String,
background: Arc<BackgroundRunner>, background: Arc<BackgroundRunner>,
todo: sled::Tree, todo: sled::Tree,
@ -69,22 +69,22 @@ impl MerkleUpdater {
) -> Arc<Self> { ) -> Arc<Self> {
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]); let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
Arc::new(Self { let ret = Arc::new(Self {
table_name, table_name,
background, background,
todo, todo,
todo_notify: Notify::new(), todo_notify: Notify::new(),
merkle_tree, merkle_tree,
empty_node_hash, empty_node_hash,
}) });
}
pub(crate) fn launch(self: &Arc<Self>) { let ret2 = ret.clone();
let self2 = self.clone(); ret.background.spawn_worker(
self.background.spawn_worker( format!("Merkle tree updater for {}", ret.table_name),
format!("Merkle tree updater for {}", self.table_name), |must_exit: watch::Receiver<bool>| ret2.updater_loop(must_exit),
|must_exit: watch::Receiver<bool>| self2.updater_loop(must_exit),
); );
ret
} }
async fn updater_loop( async fn updater_loop(
@ -156,28 +156,37 @@ impl MerkleUpdater {
new_vhash: Option<Hash>, new_vhash: Option<Hash>,
) -> ConflictableTransactionResult<Option<Hash>, Error> { ) -> ConflictableTransactionResult<Option<Hash>, Error> {
let i = key.prefix.len(); let i = key.prefix.len();
// Read node at current position (defined by the prefix stored in key)
// Calculate an update to apply to this node
// This update is an Option<_>, so that it is None if the update is a no-op
// and we can thus skip recalculating and re-storing everything
let mutate = match self.read_node_txn(tx, &key)? { let mutate = match self.read_node_txn(tx, &key)? {
MerkleNode::Empty => { MerkleNode::Empty => {
if let Some(vhv) = new_vhash { if let Some(vhv) = new_vhash {
Some(MerkleNode::Leaf(k.to_vec(), vhv)) Some(MerkleNode::Leaf(k.to_vec(), vhv))
} else { } else {
// Nothing to do, keep empty node
None None
} }
} }
MerkleNode::Intermediate(mut children) => { MerkleNode::Intermediate(mut children) => {
let key2 = key.next_key(khash); let key2 = key.next_key(khash);
if let Some(subhash) = self.update_item_rec(tx, k, khash, &key2, new_vhash)? { if let Some(subhash) = self.update_item_rec(tx, k, khash, &key2, new_vhash)? {
// Subtree changed, update this node as well
if subhash == self.empty_node_hash { if subhash == self.empty_node_hash {
intermediate_rm_child(&mut children, key2.prefix[i]); intermediate_rm_child(&mut children, key2.prefix[i]);
} else { } else {
intermediate_set_child(&mut children, key2.prefix[i], subhash); intermediate_set_child(&mut children, key2.prefix[i], subhash);
} }
if children.len() == 0 { if children.len() == 0 {
// should not happen // should not happen
warn!("Replacing intermediate node with empty node, should not happen."); warn!("Replacing intermediate node with empty node, should not happen.");
Some(MerkleNode::Empty) Some(MerkleNode::Empty)
} else if children.len() == 1 { } else if children.len() == 1 {
// move node down to this level // We now have a single node (case when the update deleted one of only two
// children). Move that single child to this level of the tree.
let key_sub = key.add_byte(children[0].0); let key_sub = key.add_byte(children[0].0);
let subnode = self.read_node_txn(tx, &key_sub)?; let subnode = self.read_node_txn(tx, &key_sub)?;
tx.remove(key_sub.encode())?; tx.remove(key_sub.encode())?;
@ -186,19 +195,23 @@ impl MerkleUpdater {
Some(MerkleNode::Intermediate(children)) Some(MerkleNode::Intermediate(children))
} }
} else { } else {
// Subtree not changed, nothing to do
None None
} }
} }
MerkleNode::Leaf(exlf_key, exlf_hash) => { MerkleNode::Leaf(exlf_key, exlf_hash) => {
if exlf_key == k { if exlf_key == k {
// This leaf is for the same key that the one we are updating
match new_vhash { match new_vhash {
Some(vhv) if vhv == exlf_hash => None, Some(vhv) if vhv == exlf_hash => None,
Some(vhv) => Some(MerkleNode::Leaf(k.to_vec(), vhv)), Some(vhv) => Some(MerkleNode::Leaf(k.to_vec(), vhv)),
None => Some(MerkleNode::Empty), None => Some(MerkleNode::Empty),
} }
} else { } else {
// This is an only leaf for another key
if let Some(vhv) = new_vhash { if let Some(vhv) = new_vhash {
// Create two sub-nodes and replace by intermediary node // Move that other key to a subnode, create another subnode for our
// insertion and replace current node by an intermediary node
let (pos1, h1) = { let (pos1, h1) = {
let key2 = key.next_key(blake2sum(&exlf_key[..])); let key2 = key.next_key(blake2sum(&exlf_key[..]));
let subhash = let subhash =
@ -216,6 +229,9 @@ impl MerkleUpdater {
intermediate_set_child(&mut int, pos2, h2); intermediate_set_child(&mut int, pos2, h2);
Some(MerkleNode::Intermediate(int)) Some(MerkleNode::Intermediate(int))
} else { } else {
// Nothing to do, we don't want to insert this value because it is None,
// and we don't want to change the other value because it's for something
// else
None None
} }
} }
@ -263,6 +279,7 @@ impl MerkleUpdater {
} }
} }
// Access a node in the Merkle tree, used by the sync protocol
pub(crate) fn read_node( pub(crate) fn read_node(
&self, &self,
k: &MerkleNodeKey, k: &MerkleNodeKey,

View file

@ -4,7 +4,7 @@ use garage_rpc::membership::System;
use garage_rpc::ring::Ring; use garage_rpc::ring::Ring;
use garage_util::data::*; use garage_util::data::*;
use crate::*; use crate::replication::*;
#[derive(Clone)] #[derive(Clone)]
pub struct TableFullReplication { pub struct TableFullReplication {

View file

@ -0,0 +1,6 @@
// Replication strategies for tables.

// Common trait and parameters shared by all replication strategies.
mod parameters;

// Strategy replicating every entry on every node.
pub mod fullcopy;
// Strategy replicating each entry on a subset of nodes chosen by the ring.
pub mod sharded;

pub use parameters::*;

View file

@ -0,0 +1,22 @@
use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
use garage_util::data::*;
/// Replication strategy for a table: decides which nodes hold each entry
/// and what quorums are required for reads and writes.
pub trait TableReplication: Send + Sync {
	// See the implementations in replication/sharded.rs and
	// replication/fullcopy.rs for examples of the various replication methods.

	// Which nodes to send reads from
	fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
	/// Number of matching read responses required for a read to succeed.
	fn read_quorum(&self) -> usize;

	// Which nodes to send writes to
	fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
	/// Number of write acknowledgements required for a write to succeed.
	fn write_quorum(&self, system: &System) -> usize;
	/// Maximum number of write errors tolerated before the operation fails.
	fn max_write_errors(&self) -> usize;

	// Which are the nodes that do actually replicate the data
	fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
	/// Hash boundaries splitting the keyspace into sync/checksum ranges.
	fn split_points(&self, ring: &Ring) -> Vec<Hash>;
}

View file

@ -2,7 +2,7 @@ use garage_rpc::membership::System;
use garage_rpc::ring::Ring; use garage_rpc::ring::Ring;
use garage_util::data::*; use garage_util::data::*;
use crate::*; use crate::replication::*;
#[derive(Clone)] #[derive(Clone)]
pub struct TableShardedReplication { pub struct TableShardedReplication {

View file

@ -2,40 +2,35 @@ use std::collections::{BTreeMap, HashMap};
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use log::warn;
use arc_swap::ArcSwapOption;
use futures::stream::*; use futures::stream::*;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf; use serde_bytes::ByteBuf;
use sled::Transactional;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_rpc::membership::System; use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
use garage_rpc::rpc_client::*; use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*; use garage_rpc::rpc_server::*;
use crate::crdt::CRDT; use crate::crdt::CRDT;
use crate::merkle::*; use crate::data::*;
use crate::schema::*; use crate::schema::*;
use crate::table_sync::*; use crate::table_sync::*;
use crate::replication::*;
const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10); const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
pub struct Table<F: TableSchema, R: TableReplication> { pub struct TableAux<F: TableSchema, R: TableReplication> {
pub instance: F,
pub replication: R,
pub name: String,
pub(crate) rpc_client: Arc<RpcClient<TableRPC<F>>>,
pub system: Arc<System>, pub system: Arc<System>,
pub store: sled::Tree, pub replication: R,
pub syncer: ArcSwapOption<TableSyncer<F, R>>, pub(crate) rpc_client: Arc<RpcClient<TableRPC<F>>>,
merkle_updater: Arc<MerkleUpdater>, }
pub struct Table<F: TableSchema, R: TableReplication> {
pub data: Arc<TableData<F>>,
pub aux: Arc<TableAux<F, R>>,
pub syncer: Arc<TableSyncer<F, R>>,
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@ -55,23 +50,6 @@ pub(crate) enum TableRPC<F: TableSchema> {
impl<F: TableSchema> RpcMessage for TableRPC<F> {} impl<F: TableSchema> RpcMessage for TableRPC<F> {}
pub trait TableReplication: Send + Sync {
// See examples in table_sharded.rs and table_fullcopy.rs
// To understand various replication methods
// Which nodes to send reads from
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
fn read_quorum(&self) -> usize;
// Which nodes to send writes to
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
fn write_quorum(&self, system: &System) -> usize;
fn max_write_errors(&self) -> usize;
// Which are the nodes that do actually replicate the data
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
fn split_points(&self, ring: &Ring) -> Vec<Hash>;
}
impl<F, R> Table<F, R> impl<F, R> Table<F, R>
where where
@ -88,60 +66,51 @@ where
name: String, name: String,
rpc_server: &mut RpcServer, rpc_server: &mut RpcServer,
) -> Arc<Self> { ) -> Arc<Self> {
let store = db
.open_tree(&format!("{}:table", name))
.expect("Unable to open DB tree");
let merkle_todo_store = db
.open_tree(&format!("{}:merkle_todo", name))
.expect("Unable to open DB Merkle TODO tree");
let merkle_tree_store = db
.open_tree(&format!("{}:merkle_tree", name))
.expect("Unable to open DB Merkle tree tree");
let rpc_path = format!("table_{}", name); let rpc_path = format!("table_{}", name);
let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path); let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path);
let merkle_updater = MerkleUpdater::new( let data = TableData::new(
name.clone(), name,
instance,
db,
system.background.clone(), system.background.clone(),
merkle_todo_store, );
merkle_tree_store,
let aux = Arc::new(TableAux{
system,
replication,
rpc_client,
});
let syncer = TableSyncer::launch(
data.clone(),
aux.clone(),
); );
let table = Arc::new(Self { let table = Arc::new(Self {
instance, data,
replication, aux,
name, syncer,
rpc_client,
system,
store,
syncer: ArcSwapOption::from(None),
merkle_updater,
}); });
table.clone().register_handler(rpc_server, rpc_path); table.clone().register_handler(rpc_server, rpc_path);
let syncer = TableSyncer::launch(table.clone());
table.syncer.swap(Some(syncer));
table.merkle_updater.launch();
table table
} }
pub async fn insert(&self, e: &F::E) -> Result<(), Error> { pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash(); let hash = e.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system); let who = self.aux.replication.write_nodes(&hash, &self.aux.system);
//eprintln!("insert who: {:?}", who); //eprintln!("insert who: {:?}", who);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?)); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
let rpc = TableRPC::<F>::Update(vec![e_enc]); let rpc = TableRPC::<F>::Update(vec![e_enc]);
self.rpc_client self.aux.rpc_client
.try_call_many( .try_call_many(
&who[..], &who[..],
rpc, rpc,
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system)) RequestStrategy::with_quorum(self.aux.replication.write_quorum(&self.aux.system))
.with_timeout(TABLE_RPC_TIMEOUT), .with_timeout(TABLE_RPC_TIMEOUT),
) )
.await?; .await?;
@ -153,7 +122,7 @@ where
for entry in entries.iter() { for entry in entries.iter() {
let hash = entry.partition_key().hash(); let hash = entry.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system); let who = self.aux.replication.write_nodes(&hash, &self.aux.system);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?)); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
for node in who { for node in who {
if !call_list.contains_key(&node) { if !call_list.contains_key(&node) {
@ -166,7 +135,7 @@ where
let call_futures = call_list.drain().map(|(node, entries)| async move { let call_futures = call_list.drain().map(|(node, entries)| async move {
let rpc = TableRPC::<F>::Update(entries); let rpc = TableRPC::<F>::Update(entries);
let resp = self.rpc_client.call(node, rpc, TABLE_RPC_TIMEOUT).await?; let resp = self.aux.rpc_client.call(node, rpc, TABLE_RPC_TIMEOUT).await?;
Ok::<_, Error>((node, resp)) Ok::<_, Error>((node, resp))
}); });
let mut resps = call_futures.collect::<FuturesUnordered<_>>(); let mut resps = call_futures.collect::<FuturesUnordered<_>>();
@ -177,7 +146,7 @@ where
errors.push(e); errors.push(e);
} }
} }
if errors.len() > self.replication.max_write_errors() { if errors.len() > self.aux.replication.max_write_errors() {
Err(Error::Message("Too many errors".into())) Err(Error::Message("Too many errors".into()))
} else { } else {
Ok(()) Ok(())
@ -190,16 +159,17 @@ where
sort_key: &F::S, sort_key: &F::S,
) -> Result<Option<F::E>, Error> { ) -> Result<Option<F::E>, Error> {
let hash = partition_key.hash(); let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system); let who = self.aux.replication.read_nodes(&hash, &self.aux.system);
//eprintln!("get who: {:?}", who); //eprintln!("get who: {:?}", who);
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone()); let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
let resps = self let resps = self
.aux
.rpc_client .rpc_client
.try_call_many( .try_call_many(
&who[..], &who[..],
rpc, rpc,
RequestStrategy::with_quorum(self.replication.read_quorum()) RequestStrategy::with_quorum(self.aux.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT) .with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true), .interrupt_after_quorum(true),
) )
@ -210,7 +180,7 @@ where
for resp in resps { for resp in resps {
if let TableRPC::ReadEntryResponse(value) = resp { if let TableRPC::ReadEntryResponse(value) = resp {
if let Some(v_bytes) = value { if let Some(v_bytes) = value {
let v = self.decode_entry(v_bytes.as_slice())?; let v = self.data.decode_entry(v_bytes.as_slice())?;
ret = match ret { ret = match ret {
None => Some(v), None => Some(v),
Some(mut x) => { Some(mut x) => {
@ -230,7 +200,7 @@ where
if not_all_same { if not_all_same {
let self2 = self.clone(); let self2 = self.clone();
let ent2 = ret_entry.clone(); let ent2 = ret_entry.clone();
self.system self.aux.system
.background .background
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await }); .spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
} }
@ -246,16 +216,16 @@ where
limit: usize, limit: usize,
) -> Result<Vec<F::E>, Error> { ) -> Result<Vec<F::E>, Error> {
let hash = partition_key.hash(); let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system); let who = self.aux.replication.read_nodes(&hash, &self.aux.system);
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit); let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
let resps = self let resps = self
.rpc_client .aux.rpc_client
.try_call_many( .try_call_many(
&who[..], &who[..],
rpc, rpc,
RequestStrategy::with_quorum(self.replication.read_quorum()) RequestStrategy::with_quorum(self.aux.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT) .with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true), .interrupt_after_quorum(true),
) )
@ -266,8 +236,8 @@ where
for resp in resps { for resp in resps {
if let TableRPC::Update(entries) = resp { if let TableRPC::Update(entries) = resp {
for entry_bytes in entries.iter() { for entry_bytes in entries.iter() {
let entry = self.decode_entry(entry_bytes.as_slice())?; let entry = self.data.decode_entry(entry_bytes.as_slice())?;
let entry_key = self.tree_key(entry.partition_key(), entry.sort_key()); let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key());
match ret.remove(&entry_key) { match ret.remove(&entry_key) {
None => { None => {
ret.insert(entry_key, Some(entry)); ret.insert(entry_key, Some(entry));
@ -287,7 +257,7 @@ where
} }
if !to_repair.is_empty() { if !to_repair.is_empty() {
let self2 = self.clone(); let self2 = self.clone();
self.system.background.spawn_cancellable(async move { self.aux.system.background.spawn_cancellable(async move {
for (_, v) in to_repair.iter_mut() { for (_, v) in to_repair.iter_mut() {
self2.repair_on_read(&who[..], v.take().unwrap()).await?; self2.repair_on_read(&who[..], v.take().unwrap()).await?;
} }
@ -306,7 +276,7 @@ where
async fn repair_on_read(&self, who: &[UUID], what: F::E) -> Result<(), Error> { async fn repair_on_read(&self, who: &[UUID], what: F::E) -> Result<(), Error> {
let what_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(&what)?)); let what_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(&what)?));
self.rpc_client self.aux.rpc_client
.try_call_many( .try_call_many(
&who[..], &who[..],
TableRPC::<F>::Update(vec![what_enc]), TableRPC::<F>::Update(vec![what_enc]),
@ -326,8 +296,8 @@ where
}); });
let self2 = self.clone(); let self2 = self.clone();
self.rpc_client self.aux.rpc_client
.set_local_handler(self.system.id, move |msg| { .set_local_handler(self.aux.system.id, move |msg| {
let self2 = self2.clone(); let self2 = self2.clone();
async move { self2.handle(&msg).await } async move { self2.handle(&msg).await }
}); });
@ -336,157 +306,24 @@ where
async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> { async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> {
match msg { match msg {
TableRPC::ReadEntry(key, sort_key) => { TableRPC::ReadEntry(key, sort_key) => {
let value = self.handle_read_entry(key, sort_key)?; let value = self.data.read_entry(key, sort_key)?;
Ok(TableRPC::ReadEntryResponse(value)) Ok(TableRPC::ReadEntryResponse(value))
} }
TableRPC::ReadRange(key, begin_sort_key, filter, limit) => { TableRPC::ReadRange(key, begin_sort_key, filter, limit) => {
let values = self.handle_read_range(key, begin_sort_key, filter, *limit)?; let values = self.data.read_range(key, begin_sort_key, filter, *limit)?;
Ok(TableRPC::Update(values)) Ok(TableRPC::Update(values))
} }
TableRPC::Update(pairs) => { TableRPC::Update(pairs) => {
self.handle_update(pairs)?; self.data.update_many(pairs)?;
Ok(TableRPC::Ok) Ok(TableRPC::Ok)
} }
TableRPC::SyncRPC(rpc) => { TableRPC::SyncRPC(rpc) => {
let syncer = self.syncer.load_full().unwrap(); let response = self.syncer
let response = syncer .handle_rpc(rpc, self.aux.system.background.stop_signal.clone())
.handle_rpc(rpc, self.system.background.stop_signal.clone())
.await?; .await?;
Ok(TableRPC::SyncRPC(response)) Ok(TableRPC::SyncRPC(response))
} }
_ => Err(Error::BadRPC(format!("Unexpected table RPC"))), _ => Err(Error::BadRPC(format!("Unexpected table RPC"))),
} }
} }
fn handle_read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
let tree_key = self.tree_key(p, s);
if let Some(bytes) = self.store.get(&tree_key)? {
Ok(Some(ByteBuf::from(bytes.to_vec())))
} else {
Ok(None)
}
}
fn handle_read_range(
&self,
p: &F::P,
s: &Option<F::S>,
filter: &Option<F::Filter>,
limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
let partition_hash = p.hash();
let first_key = match s {
None => partition_hash.to_vec(),
Some(sk) => self.tree_key(p, sk),
};
let mut ret = vec![];
for item in self.store.range(first_key..) {
let (key, value) = item?;
if &key[..32] != partition_hash.as_slice() {
break;
}
let keep = match filter {
None => true,
Some(f) => {
let entry = self.decode_entry(value.as_ref())?;
F::matches_filter(&entry, f)
}
};
if keep {
ret.push(Arc::new(ByteBuf::from(value.as_ref())));
}
if ret.len() >= limit {
break;
}
}
Ok(ret)
}
// ========== CODE THAT ACTUALLY MODIFIES THE TREE ================
pub fn handle_update(self: &Arc<Self>, entries: &[Arc<ByteBuf>]) -> Result<(), Error> {
for update_bytes in entries.iter() {
self.update_entry(update_bytes.as_slice())?;
}
Ok(())
}
pub(crate) fn update_entry(self: &Arc<Self>, update_bytes: &[u8]) -> Result<(), Error> {
let update = self.decode_entry(update_bytes)?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
let changed = (&self.store, &self.merkle_updater.todo).transaction(|(db, mkl_todo)| {
let (old_entry, new_entry) = match db.get(&tree_key)? {
Some(prev_bytes) => {
let old_entry = self
.decode_entry(&prev_bytes)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
let mut new_entry = old_entry.clone();
new_entry.merge(&update);
(Some(old_entry), new_entry)
}
None => (None, update.clone()),
};
if Some(&new_entry) != old_entry.as_ref() {
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RMPEncode)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
mkl_todo.insert(tree_key.clone(), blake2sum(&new_bytes[..]).to_vec())?;
db.insert(tree_key.clone(), new_bytes)?;
Ok(Some((old_entry, new_entry)))
} else {
Ok(None)
}
})?;
if let Some((old_entry, new_entry)) = changed {
self.instance.updated(old_entry, Some(new_entry));
self.syncer.load_full().unwrap().invalidate(&tree_key[..]);
}
Ok(())
}
/// Compare-and-delete: remove key `k` from the store only if its current
/// stored value is byte-identical to `v`. Returns whether a removal occurred.
///
/// Runs in a sled transaction spanning the main store and the Merkle todo
/// tree, so the removal and its Merkle marker are committed atomically.
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
	let removed = (&self.store, &self.merkle_updater.todo).transaction(|(txn, mkl_todo)| {
		if let Some(cur_v) = txn.get(k)? {
			if cur_v == v {
				txn.remove(k)?;
				// An empty value marks a deletion for the Merkle updater.
				mkl_todo.insert(k, vec![])?;
				return Ok(true);
			}
		}
		// Key absent or value differs: nothing removed.
		Ok(false)
	})?;
	if removed {
		// `v` matched the stored bytes, so it decodes to the removed entry.
		let old_entry = self.decode_entry(v)?;
		self.instance.updated(Some(old_entry), None);
		self.syncer.load_full().unwrap().invalidate(k);
	}
	Ok(removed)
}
/// Build the sled tree key for an entry: the partition key's hash
/// followed by the entry's sort key.
fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
	let mut key = p.hash().to_vec();
	let sort = s.sort_key();
	key.extend(sort);
	key
}
/// Deserialize a stored entry. Tries the current schema first, then falls
/// back to the schema's migration hook for values in an older format.
/// On total failure, logs a warning plus a hexdump of the bytes at debug
/// level and returns the original decode error.
fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
	let decode_err = match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
		Ok(entry) => return Ok(entry),
		Err(e) => e,
	};
	if let Some(migrated) = F::try_migrate(bytes) {
		return Ok(migrated);
	}
	warn!("Unable to decode entry of {}: {}", self.name, decode_err);
	for line in hexdump::hexdump_iter(bytes) {
		debug!("{}", line);
	}
	Err(decode_err.into())
}
} }

View file

@ -16,18 +16,22 @@ use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use crate::*; use crate::*;
use crate::data::*;
use crate::replication::*;
const MAX_DEPTH: usize = 16; const MAX_DEPTH: usize = 16;
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30); const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
// Scan & sync every 12 hours // Do anti-entropy every 10 minutes
const SCAN_INTERVAL: Duration = Duration::from_secs(12 * 60 * 60); const SCAN_INTERVAL: Duration = Duration::from_secs(10 * 60);
// Expire cache after 30 minutes const CHECKSUM_CACHE_TIMEOUT: Duration = Duration::from_secs(10 * 60);
const CHECKSUM_CACHE_TIMEOUT: Duration = Duration::from_secs(30 * 60);
pub struct TableSyncer<F: TableSchema, R: TableReplication> { pub struct TableSyncer<F: TableSchema, R: TableReplication> {
table: Arc<Table<F, R>>, data: Arc<TableData<F>>,
aux: Arc<TableAux<F, R>>,
todo: Mutex<SyncTodo>, todo: Mutex<SyncTodo>,
cache: Vec<Mutex<BTreeMap<SyncRange, RangeChecksumCache>>>, cache: Vec<Mutex<BTreeMap<SyncRange, RangeChecksumCache>>>,
} }
@ -106,10 +110,13 @@ where
F: TableSchema + 'static, F: TableSchema + 'static,
R: TableReplication + 'static, R: TableReplication + 'static,
{ {
pub(crate) fn launch(table: Arc<Table<F, R>>) -> Arc<Self> { pub(crate) fn launch(data: Arc<TableData<F>>,
let todo = SyncTodo { todo: Vec::new() }; aux: Arc<TableAux<F, R>>) -> Arc<Self> {
let syncer = Arc::new(TableSyncer { let todo = SyncTodo{ todo: vec![] };
table: table.clone(),
let syncer = Arc::new(Self {
data: data.clone(),
aux: aux.clone(),
todo: Mutex::new(todo), todo: Mutex::new(todo),
cache: (0..MAX_DEPTH) cache: (0..MAX_DEPTH)
.map(|_| Mutex::new(BTreeMap::new())) .map(|_| Mutex::new(BTreeMap::new()))
@ -119,21 +126,21 @@ where
let (busy_tx, busy_rx) = mpsc::unbounded_channel(); let (busy_tx, busy_rx) = mpsc::unbounded_channel();
let s1 = syncer.clone(); let s1 = syncer.clone();
table.system.background.spawn_worker( aux.system.background.spawn_worker(
format!("table sync watcher for {}", table.name), format!("table sync watcher for {}", data.name),
move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx), move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
); );
let s2 = syncer.clone(); let s2 = syncer.clone();
table.system.background.spawn_worker( aux.system.background.spawn_worker(
format!("table syncer for {}", table.name), format!("table syncer for {}", data.name),
move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx), move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
); );
let s3 = syncer.clone(); let s3 = syncer.clone();
tokio::spawn(async move { tokio::spawn(async move {
tokio::time::delay_for(Duration::from_secs(20)).await; tokio::time::delay_for(Duration::from_secs(20)).await;
s3.add_full_scan().await; s3.add_full_scan();
}); });
syncer syncer
@ -144,8 +151,8 @@ where
mut must_exit: watch::Receiver<bool>, mut must_exit: watch::Receiver<bool>,
mut busy_rx: mpsc::UnboundedReceiver<bool>, mut busy_rx: mpsc::UnboundedReceiver<bool>,
) -> Result<(), Error> { ) -> Result<(), Error> {
let mut prev_ring: Arc<Ring> = self.table.system.ring.borrow().clone(); let mut prev_ring: Arc<Ring> = self.aux.system.ring.borrow().clone();
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.table.system.ring.clone(); let mut ring_recv: watch::Receiver<Arc<Ring>> = self.aux.system.ring.clone();
let mut nothing_to_do_since = Some(Instant::now()); let mut nothing_to_do_since = Some(Instant::now());
while !*must_exit.borrow() { while !*must_exit.borrow() {
@ -158,8 +165,8 @@ where
select! { select! {
new_ring_r = s_ring_recv => { new_ring_r = s_ring_recv => {
if let Some(new_ring) = new_ring_r { if let Some(new_ring) = new_ring_r {
debug!("({}) Adding ring difference to syncer todo list", self.table.name); debug!("({}) Adding ring difference to syncer todo list", self.data.name);
self.todo.lock().unwrap().add_ring_difference(&self.table, &prev_ring, &new_ring); self.todo.lock().unwrap().add_ring_difference(&prev_ring, &new_ring, &self.data, &self.aux);
prev_ring = new_ring; prev_ring = new_ring;
} }
} }
@ -182,8 +189,8 @@ where
_ = s_timeout => { _ = s_timeout => {
if nothing_to_do_since.map(|t| Instant::now() - t >= SCAN_INTERVAL).unwrap_or(false) { if nothing_to_do_since.map(|t| Instant::now() - t >= SCAN_INTERVAL).unwrap_or(false) {
nothing_to_do_since = None; nothing_to_do_since = None;
debug!("({}) Adding full scan to syncer todo list", self.table.name); debug!("({}) Adding full scan to syncer todo list", self.data.name);
self.add_full_scan().await; self.add_full_scan();
} }
} }
} }
@ -191,8 +198,8 @@ where
Ok(()) Ok(())
} }
pub async fn add_full_scan(&self) { pub fn add_full_scan(&self) {
self.todo.lock().unwrap().add_full_scan(&self.table); self.todo.lock().unwrap().add_full_scan(&self.data, &self.aux);
} }
async fn syncer_task( async fn syncer_task(
@ -211,7 +218,7 @@ where
if let Err(e) = res { if let Err(e) = res {
warn!( warn!(
"({}) Error while syncing {:?}: {}", "({}) Error while syncing {:?}: {}",
self.table.name, partition, e self.data.name, partition, e
); );
} }
} else { } else {
@ -228,18 +235,18 @@ where
must_exit: &mut watch::Receiver<bool>, must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> { ) -> Result<(), Error> {
if partition.retain { if partition.retain {
let my_id = self.table.system.id; let my_id = self.aux.system.id;
let nodes = self let nodes = self
.table .aux
.replication .replication
.write_nodes(&partition.begin, &self.table.system) .write_nodes(&partition.begin, &self.aux.system)
.into_iter() .into_iter()
.filter(|node| *node != my_id) .filter(|node| *node != my_id)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
debug!( debug!(
"({}) Preparing to sync {:?} with {:?}...", "({}) Preparing to sync {:?} with {:?}...",
self.table.name, partition, nodes self.data.name, partition, nodes
); );
let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?; let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?;
@ -259,10 +266,10 @@ where
while let Some(r) = sync_futures.next().await { while let Some(r) = sync_futures.next().await {
if let Err(e) = r { if let Err(e) = r {
n_errors += 1; n_errors += 1;
warn!("({}) Sync error: {}", self.table.name, e); warn!("({}) Sync error: {}", self.data.name, e);
} }
} }
if n_errors > self.table.replication.max_write_errors() { if n_errors > self.aux.replication.max_write_errors() {
return Err(Error::Message(format!( return Err(Error::Message(format!(
"Sync failed with too many nodes (should have been: {:?}).", "Sync failed with too many nodes (should have been: {:?}).",
nodes nodes
@ -293,7 +300,7 @@ where
while !*must_exit.borrow() { while !*must_exit.borrow() {
let mut items = Vec::new(); let mut items = Vec::new();
for item in self.table.store.range(begin.to_vec()..end.to_vec()) { for item in self.data.store.range(begin.to_vec()..end.to_vec()) {
let (key, value) = item?; let (key, value) = item?;
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref())))); items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
@ -304,12 +311,12 @@ where
if items.len() > 0 { if items.len() > 0 {
let nodes = self let nodes = self
.table .aux
.replication .replication
.write_nodes(&begin, &self.table.system) .write_nodes(&begin, &self.aux.system)
.into_iter() .into_iter()
.collect::<Vec<_>>(); .collect::<Vec<_>>();
if nodes.contains(&self.table.system.id) { if nodes.contains(&self.aux.system.id) {
warn!("Interrupting offload as partitions seem to have changed"); warn!("Interrupting offload as partitions seem to have changed");
break; break;
} }
@ -340,7 +347,7 @@ where
let update_msg = Arc::new(TableRPC::<F>::Update(values)); let update_msg = Arc::new(TableRPC::<F>::Update(values));
for res in join_all(nodes.iter().map(|to| { for res in join_all(nodes.iter().map(|to| {
self.table self.aux
.rpc_client .rpc_client
.call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT) .call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT)
})) }))
@ -352,7 +359,7 @@ where
// All remote nodes have written those items, now we can delete them locally // All remote nodes have written those items, now we can delete them locally
let mut not_removed = 0; let mut not_removed = 0;
for (k, v) in items.iter() { for (k, v) in items.iter() {
if !self.table.delete_if_equal(&k[..], &v[..])? { if !self.data.delete_if_equal(&k[..], &v[..])? {
not_removed += 1; not_removed += 1;
} }
} }
@ -399,7 +406,7 @@ where
if range.level == 1 { if range.level == 1 {
let mut children = vec![]; let mut children = vec![];
for item in self for item in self
.table .data
.store .store
.range(range.begin.clone()..range.end.clone()) .range(range.begin.clone()..range.end.clone())
{ {
@ -516,7 +523,7 @@ where
let v = self.range_checksum(&range, must_exit)?; let v = self.range_checksum(&range, must_exit)?;
trace!( trace!(
"({}) New checksum calculated for {}-{}/{}, {} children", "({}) New checksum calculated for {}-{}/{}, {} children",
self.table.name, self.data.name,
hex::encode(&range.begin) hex::encode(&range.begin)
.chars() .chars()
.take(16) .take(16)
@ -553,7 +560,7 @@ where
// If their root checksum has level > than us, use that as a reference // If their root checksum has level > than us, use that as a reference
let root_cks_resp = self let root_cks_resp = self
.table .aux
.rpc_client .rpc_client
.call( .call(
who, who,
@ -582,7 +589,7 @@ where
let total_children = todo.iter().map(|x| x.children.len()).fold(0, |x, y| x + y); let total_children = todo.iter().map(|x| x.children.len()).fold(0, |x, y| x + y);
trace!( trace!(
"({}) Sync with {:?}: {} ({}) remaining", "({}) Sync with {:?}: {} ({}) remaining",
self.table.name, self.data.name,
who, who,
todo.len(), todo.len(),
total_children total_children
@ -592,7 +599,7 @@ where
let step = todo.drain(..step_size).collect::<Vec<_>>(); let step = todo.drain(..step_size).collect::<Vec<_>>();
let rpc_resp = self let rpc_resp = self
.table .aux
.rpc_client .rpc_client
.call( .call(
who, who,
@ -606,7 +613,7 @@ where
if diff_ranges.len() > 0 || diff_items.len() > 0 { if diff_ranges.len() > 0 || diff_items.len() > 0 {
info!( info!(
"({}) Sync with {:?}: difference {} ranges, {} items", "({}) Sync with {:?}: difference {} ranges, {} items",
self.table.name, self.data.name,
who, who,
diff_ranges.len(), diff_ranges.len(),
diff_items.len() diff_items.len()
@ -622,7 +629,7 @@ where
} }
} }
if diff_items.len() > 0 { if diff_items.len() > 0 {
self.table.handle_update(&diff_items[..])?; self.data.update_many(&diff_items[..])?;
} }
if items_to_send.len() > 0 { if items_to_send.len() > 0 {
self.send_items(who, items_to_send).await?; self.send_items(who, items_to_send).await?;
@ -640,19 +647,19 @@ where
async fn send_items(&self, who: UUID, item_list: Vec<Vec<u8>>) -> Result<(), Error> { async fn send_items(&self, who: UUID, item_list: Vec<Vec<u8>>) -> Result<(), Error> {
info!( info!(
"({}) Sending {} items to {:?}", "({}) Sending {} items to {:?}",
self.table.name, self.data.name,
item_list.len(), item_list.len(),
who who
); );
let mut values = vec![]; let mut values = vec![];
for item in item_list.iter() { for item in item_list.iter() {
if let Some(v) = self.table.store.get(&item[..])? { if let Some(v) = self.data.store.get(&item[..])? {
values.push(Arc::new(ByteBuf::from(v.as_ref()))); values.push(Arc::new(ByteBuf::from(v.as_ref())));
} }
} }
let rpc_resp = self let rpc_resp = self
.table .aux
.rpc_client .rpc_client
.call(who, TableRPC::<F>::Update(values), TABLE_SYNC_RPC_TIMEOUT) .call(who, TableRPC::<F>::Update(values), TABLE_SYNC_RPC_TIMEOUT)
.await?; .await?;
@ -714,7 +721,7 @@ where
ret_ranges.push(their_range.clone()); ret_ranges.push(their_range.clone());
if their_range.level == 0 { if their_range.level == 0 {
if let Some(item_bytes) = if let Some(item_bytes) =
self.table.store.get(their_range.begin.as_slice())? self.data.store.get(their_range.begin.as_slice())?
{ {
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec()))); ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
} }
@ -738,7 +745,7 @@ where
} }
if our_range.level == 0 { if our_range.level == 0 {
if let Some(item_bytes) = if let Some(item_bytes) =
self.table.store.get(our_range.begin.as_slice())? self.data.store.get(our_range.begin.as_slice())?
{ {
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec()))); ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
} }
@ -753,7 +760,7 @@ where
if ret_ranges.len() > 0 || ret_items.len() > 0 { if ret_ranges.len() > 0 || ret_items.len() > 0 {
trace!( trace!(
"({}) Checksum comparison RPC: {} different + {} items for {} received", "({}) Checksum comparison RPC: {} different + {} items for {} received",
self.table.name, self.data.name,
ret_ranges.len(), ret_ranges.len(),
ret_items.len(), ret_items.len(),
n_checksums n_checksums
@ -782,13 +789,13 @@ where
} }
impl SyncTodo { impl SyncTodo {
fn add_full_scan<F: TableSchema, R: TableReplication>(&mut self, table: &Table<F, R>) { fn add_full_scan<F: TableSchema, R: TableReplication>(&mut self, data: &TableData<F>, aux: &TableAux<F, R>) {
let my_id = table.system.id; let my_id = aux.system.id;
self.todo.clear(); self.todo.clear();
let ring = table.system.ring.borrow().clone(); let ring = aux.system.ring.borrow().clone();
let split_points = table.replication.split_points(&ring); let split_points = aux.replication.split_points(&ring);
for i in 0..split_points.len() - 1 { for i in 0..split_points.len() - 1 {
let begin = split_points[i]; let begin = split_points[i];
@ -797,12 +804,12 @@ impl SyncTodo {
continue; continue;
} }
let nodes = table.replication.replication_nodes(&begin, &ring); let nodes = aux.replication.replication_nodes(&begin, &ring);
let retain = nodes.contains(&my_id); let retain = nodes.contains(&my_id);
if !retain { if !retain {
// Check if we have some data to send, otherwise skip // Check if we have some data to send, otherwise skip
if table.store.range(begin..end).next().is_none() { if data.store.range(begin..end).next().is_none() {
continue; continue;
} }
} }
@ -813,25 +820,25 @@ impl SyncTodo {
fn add_ring_difference<F: TableSchema, R: TableReplication>( fn add_ring_difference<F: TableSchema, R: TableReplication>(
&mut self, &mut self,
table: &Table<F, R>,
old_ring: &Ring, old_ring: &Ring,
new_ring: &Ring, new_ring: &Ring,
data: &TableData<F>, aux: &TableAux<F, R>,
) { ) {
let my_id = table.system.id; let my_id = aux.system.id;
// If it is us who are entering or leaving the system, // If it is us who are entering or leaving the system,
// initiate a full sync instead of incremental sync // initiate a full sync instead of incremental sync
if old_ring.config.members.contains_key(&my_id) if old_ring.config.members.contains_key(&my_id)
!= new_ring.config.members.contains_key(&my_id) != new_ring.config.members.contains_key(&my_id)
{ {
self.add_full_scan(table); self.add_full_scan(data, aux);
return; return;
} }
let mut all_points = None let mut all_points = None
.into_iter() .into_iter()
.chain(table.replication.split_points(old_ring).drain(..)) .chain(aux.replication.split_points(old_ring).drain(..))
.chain(table.replication.split_points(new_ring).drain(..)) .chain(aux.replication.split_points(new_ring).drain(..))
.chain(self.todo.iter().map(|x| x.begin)) .chain(self.todo.iter().map(|x| x.begin))
.chain(self.todo.iter().map(|x| x.end)) .chain(self.todo.iter().map(|x| x.end))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@ -845,11 +852,11 @@ impl SyncTodo {
for i in 0..all_points.len() - 1 { for i in 0..all_points.len() - 1 {
let begin = all_points[i]; let begin = all_points[i];
let end = all_points[i + 1]; let end = all_points[i + 1];
let was_ours = table let was_ours = aux
.replication .replication
.replication_nodes(&begin, &old_ring) .replication_nodes(&begin, &old_ring)
.contains(&my_id); .contains(&my_id);
let is_ours = table let is_ours = aux
.replication .replication
.replication_nodes(&begin, &new_ring) .replication_nodes(&begin, &new_ring)
.contains(&my_id); .contains(&my_id);