WIP big refactoring
This commit is contained in:
parent
8d63738cb0
commit
94f3d28774
16 changed files with 387 additions and 325 deletions
3
Cargo.lock
generated
3
Cargo.lock
generated
|
@ -1,5 +1,7 @@
|
||||||
# This file is automatically @generated by Cargo.
|
# This file is automatically @generated by Cargo.
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
|
version = 3
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "0.7.15"
|
version = "0.7.15"
|
||||||
|
@ -583,7 +585,6 @@ dependencies = [
|
||||||
name = "garage_table"
|
name = "garage_table"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arc-swap",
|
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes 0.4.12",
|
"bytes 0.4.12",
|
||||||
"futures",
|
"futures",
|
||||||
|
|
|
@ -28,38 +28,23 @@ impl Repair {
|
||||||
self.garage
|
self.garage
|
||||||
.bucket_table
|
.bucket_table
|
||||||
.syncer
|
.syncer
|
||||||
.load_full()
|
.add_full_scan();
|
||||||
.unwrap()
|
|
||||||
.add_full_scan()
|
|
||||||
.await;
|
|
||||||
self.garage
|
self.garage
|
||||||
.object_table
|
.object_table
|
||||||
.syncer
|
.syncer
|
||||||
.load_full()
|
.add_full_scan();
|
||||||
.unwrap()
|
|
||||||
.add_full_scan()
|
|
||||||
.await;
|
|
||||||
self.garage
|
self.garage
|
||||||
.version_table
|
.version_table
|
||||||
.syncer
|
.syncer
|
||||||
.load_full()
|
.add_full_scan();
|
||||||
.unwrap()
|
|
||||||
.add_full_scan()
|
|
||||||
.await;
|
|
||||||
self.garage
|
self.garage
|
||||||
.block_ref_table
|
.block_ref_table
|
||||||
.syncer
|
.syncer
|
||||||
.load_full()
|
.add_full_scan();
|
||||||
.unwrap()
|
|
||||||
.add_full_scan()
|
|
||||||
.await;
|
|
||||||
self.garage
|
self.garage
|
||||||
.key_table
|
.key_table
|
||||||
.syncer
|
.syncer
|
||||||
.load_full()
|
.add_full_scan();
|
||||||
.unwrap()
|
|
||||||
.add_full_scan()
|
|
||||||
.await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: wait for full sync to finish before proceeding to the rest?
|
// TODO: wait for full sync to finish before proceeding to the rest?
|
||||||
|
@ -93,7 +78,7 @@ impl Repair {
|
||||||
async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
|
async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
|
||||||
let mut pos = vec![];
|
let mut pos = vec![];
|
||||||
|
|
||||||
while let Some((item_key, item_bytes)) = self.garage.version_table.store.get_gt(&pos)? {
|
while let Some((item_key, item_bytes)) = self.garage.version_table.data.store.get_gt(&pos)? {
|
||||||
pos = item_key.to_vec();
|
pos = item_key.to_vec();
|
||||||
|
|
||||||
let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?;
|
let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?;
|
||||||
|
@ -141,7 +126,7 @@ impl Repair {
|
||||||
async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
|
async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
|
||||||
let mut pos = vec![];
|
let mut pos = vec![];
|
||||||
|
|
||||||
while let Some((item_key, item_bytes)) = self.garage.block_ref_table.store.get_gt(&pos)? {
|
while let Some((item_key, item_bytes)) = self.garage.block_ref_table.data.store.get_gt(&pos)? {
|
||||||
pos = item_key.to_vec();
|
pos = item_key.to_vec();
|
||||||
|
|
||||||
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?;
|
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?;
|
||||||
|
|
|
@ -19,8 +19,7 @@ use garage_rpc::membership::System;
|
||||||
use garage_rpc::rpc_client::*;
|
use garage_rpc::rpc_client::*;
|
||||||
use garage_rpc::rpc_server::*;
|
use garage_rpc::rpc_server::*;
|
||||||
|
|
||||||
use garage_table::table_sharded::TableShardedReplication;
|
use garage_table::replication::{sharded::TableShardedReplication, TableReplication};
|
||||||
use garage_table::TableReplication;
|
|
||||||
|
|
||||||
use crate::block_ref_table::*;
|
use crate::block_ref_table::*;
|
||||||
|
|
||||||
|
@ -412,7 +411,7 @@ impl BlockManager {
|
||||||
let garage = self.garage.load_full().unwrap();
|
let garage = self.garage.load_full().unwrap();
|
||||||
let mut last_hash = None;
|
let mut last_hash = None;
|
||||||
let mut i = 0usize;
|
let mut i = 0usize;
|
||||||
for entry in garage.block_ref_table.store.iter() {
|
for entry in garage.block_ref_table.data.store.iter() {
|
||||||
let (_k, v_bytes) = entry?;
|
let (_k, v_bytes) = entry?;
|
||||||
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?;
|
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?;
|
||||||
if Some(&block_ref.block) == last_hash.as_ref() {
|
if Some(&block_ref.block) == last_hash.as_ref() {
|
||||||
|
|
|
@ -7,8 +7,8 @@ use garage_rpc::membership::System;
|
||||||
use garage_rpc::rpc_client::RpcHttpClient;
|
use garage_rpc::rpc_client::RpcHttpClient;
|
||||||
use garage_rpc::rpc_server::RpcServer;
|
use garage_rpc::rpc_server::RpcServer;
|
||||||
|
|
||||||
use garage_table::table_fullcopy::*;
|
use garage_table::replication::sharded::*;
|
||||||
use garage_table::table_sharded::*;
|
use garage_table::replication::fullcopy::*;
|
||||||
use garage_table::*;
|
use garage_table::*;
|
||||||
|
|
||||||
use crate::block::*;
|
use crate::block::*;
|
||||||
|
|
|
@ -6,7 +6,7 @@ use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use garage_table::crdt::*;
|
use garage_table::crdt::*;
|
||||||
use garage_table::table_sharded::*;
|
use garage_table::replication::sharded::*;
|
||||||
use garage_table::*;
|
use garage_table::*;
|
||||||
|
|
||||||
use crate::version_table::*;
|
use crate::version_table::*;
|
||||||
|
|
|
@ -5,7 +5,7 @@ use garage_util::background::BackgroundRunner;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use garage_table::crdt::*;
|
use garage_table::crdt::*;
|
||||||
use garage_table::table_sharded::*;
|
use garage_table::replication::sharded::*;
|
||||||
use garage_table::*;
|
use garage_table::*;
|
||||||
|
|
||||||
use crate::block_ref_table::*;
|
use crate::block_ref_table::*;
|
||||||
|
|
|
@ -19,7 +19,6 @@ garage_rpc = { version = "0.1.1", path = "../rpc" }
|
||||||
bytes = "0.4"
|
bytes = "0.4"
|
||||||
rand = "0.7"
|
rand = "0.7"
|
||||||
hex = "0.3"
|
hex = "0.3"
|
||||||
arc-swap = "0.4"
|
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
hexdump = "0.1"
|
hexdump = "0.1"
|
||||||
|
|
||||||
|
|
189
src/table/data.rs
Normal file
189
src/table/data.rs
Normal file
|
@ -0,0 +1,189 @@
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use log::warn;
|
||||||
|
use sled::Transactional;
|
||||||
|
use serde_bytes::ByteBuf;
|
||||||
|
|
||||||
|
use garage_util::data::*;
|
||||||
|
use garage_util::error::*;
|
||||||
|
use garage_util::background::BackgroundRunner;
|
||||||
|
|
||||||
|
use crate::schema::*;
|
||||||
|
use crate::merkle::*;
|
||||||
|
use crate::crdt::CRDT;
|
||||||
|
|
||||||
|
pub struct TableData<F: TableSchema> {
|
||||||
|
pub name: String,
|
||||||
|
pub instance: F,
|
||||||
|
|
||||||
|
pub store: sled::Tree,
|
||||||
|
pub(crate) merkle_updater: Arc<MerkleUpdater>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<F> TableData<F> where F: TableSchema {
|
||||||
|
pub fn new(
|
||||||
|
name: String,
|
||||||
|
instance: F,
|
||||||
|
db: &sled::Db,
|
||||||
|
background: Arc<BackgroundRunner>,
|
||||||
|
) -> Arc<Self> {
|
||||||
|
let store = db
|
||||||
|
.open_tree(&format!("{}:table", name))
|
||||||
|
.expect("Unable to open DB tree");
|
||||||
|
|
||||||
|
let merkle_todo_store = db
|
||||||
|
.open_tree(&format!("{}:merkle_todo", name))
|
||||||
|
.expect("Unable to open DB Merkle TODO tree");
|
||||||
|
let merkle_tree_store = db
|
||||||
|
.open_tree(&format!("{}:merkle_tree", name))
|
||||||
|
.expect("Unable to open DB Merkle tree tree");
|
||||||
|
|
||||||
|
let merkle_updater = MerkleUpdater::launch(
|
||||||
|
name.clone(),
|
||||||
|
background,
|
||||||
|
merkle_todo_store,
|
||||||
|
merkle_tree_store,
|
||||||
|
);
|
||||||
|
|
||||||
|
Arc::new(Self{
|
||||||
|
name,
|
||||||
|
instance,
|
||||||
|
store,
|
||||||
|
merkle_updater,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read functions
|
||||||
|
|
||||||
|
pub fn read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
|
||||||
|
let tree_key = self.tree_key(p, s);
|
||||||
|
if let Some(bytes) = self.store.get(&tree_key)? {
|
||||||
|
Ok(Some(ByteBuf::from(bytes.to_vec())))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn read_range(
|
||||||
|
&self,
|
||||||
|
p: &F::P,
|
||||||
|
s: &Option<F::S>,
|
||||||
|
filter: &Option<F::Filter>,
|
||||||
|
limit: usize,
|
||||||
|
) -> Result<Vec<Arc<ByteBuf>>, Error> {
|
||||||
|
let partition_hash = p.hash();
|
||||||
|
let first_key = match s {
|
||||||
|
None => partition_hash.to_vec(),
|
||||||
|
Some(sk) => self.tree_key(p, sk),
|
||||||
|
};
|
||||||
|
let mut ret = vec![];
|
||||||
|
for item in self.store.range(first_key..) {
|
||||||
|
let (key, value) = item?;
|
||||||
|
if &key[..32] != partition_hash.as_slice() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let keep = match filter {
|
||||||
|
None => true,
|
||||||
|
Some(f) => {
|
||||||
|
let entry = self.decode_entry(value.as_ref())?;
|
||||||
|
F::matches_filter(&entry, f)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if keep {
|
||||||
|
ret.push(Arc::new(ByteBuf::from(value.as_ref())));
|
||||||
|
}
|
||||||
|
if ret.len() >= limit {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mutation functions
|
||||||
|
|
||||||
|
pub(crate) fn update_many(&self, entries: &[Arc<ByteBuf>]) -> Result<(), Error> {
|
||||||
|
for update_bytes in entries.iter() {
|
||||||
|
self.update_entry(update_bytes.as_slice())?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
|
||||||
|
let update = self.decode_entry(update_bytes)?;
|
||||||
|
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
|
||||||
|
|
||||||
|
let changed = (&self.store, &self.merkle_updater.todo).transaction(|(db, mkl_todo)| {
|
||||||
|
let (old_entry, new_entry) = match db.get(&tree_key)? {
|
||||||
|
Some(prev_bytes) => {
|
||||||
|
let old_entry = self
|
||||||
|
.decode_entry(&prev_bytes)
|
||||||
|
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
|
||||||
|
let mut new_entry = old_entry.clone();
|
||||||
|
new_entry.merge(&update);
|
||||||
|
(Some(old_entry), new_entry)
|
||||||
|
}
|
||||||
|
None => (None, update.clone()),
|
||||||
|
};
|
||||||
|
|
||||||
|
if Some(&new_entry) != old_entry.as_ref() {
|
||||||
|
let new_bytes = rmp_to_vec_all_named(&new_entry)
|
||||||
|
.map_err(Error::RMPEncode)
|
||||||
|
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
|
||||||
|
mkl_todo.insert(tree_key.clone(), blake2sum(&new_bytes[..]).to_vec())?;
|
||||||
|
db.insert(tree_key.clone(), new_bytes)?;
|
||||||
|
Ok(Some((old_entry, new_entry)))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if let Some((old_entry, new_entry)) = changed {
|
||||||
|
self.instance.updated(old_entry, Some(new_entry));
|
||||||
|
//self.syncer.load_full().unwrap().invalidate(&tree_key[..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
|
||||||
|
let removed = (&self.store, &self.merkle_updater.todo).transaction(|(txn, mkl_todo)| {
|
||||||
|
if let Some(cur_v) = txn.get(k)? {
|
||||||
|
if cur_v == v {
|
||||||
|
txn.remove(k)?;
|
||||||
|
mkl_todo.insert(k, vec![])?;
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(false)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if removed {
|
||||||
|
let old_entry = self.decode_entry(v)?;
|
||||||
|
self.instance.updated(Some(old_entry), None);
|
||||||
|
//self.syncer.load_full().unwrap().invalidate(k);
|
||||||
|
}
|
||||||
|
Ok(removed)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
|
||||||
|
let mut ret = p.hash().to_vec();
|
||||||
|
ret.extend(s.sort_key());
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
|
||||||
|
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
|
||||||
|
Ok(x) => Ok(x),
|
||||||
|
Err(e) => match F::try_migrate(bytes) {
|
||||||
|
Some(x) => Ok(x),
|
||||||
|
None => {
|
||||||
|
warn!("Unable to decode entry of {}: {}", self.name, e);
|
||||||
|
for line in hexdump::hexdump_iter(bytes) {
|
||||||
|
debug!("{}", line);
|
||||||
|
}
|
||||||
|
Err(e.into())
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -8,9 +8,9 @@ pub mod schema;
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
|
||||||
pub mod merkle;
|
pub mod merkle;
|
||||||
|
pub mod replication;
|
||||||
|
pub mod data;
|
||||||
pub mod table;
|
pub mod table;
|
||||||
pub mod table_fullcopy;
|
|
||||||
pub mod table_sharded;
|
|
||||||
pub mod table_sync;
|
pub mod table_sync;
|
||||||
|
|
||||||
pub use schema::*;
|
pub use schema::*;
|
||||||
|
|
|
@ -61,7 +61,7 @@ pub enum MerkleNode {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MerkleUpdater {
|
impl MerkleUpdater {
|
||||||
pub(crate) fn new(
|
pub(crate) fn launch(
|
||||||
table_name: String,
|
table_name: String,
|
||||||
background: Arc<BackgroundRunner>,
|
background: Arc<BackgroundRunner>,
|
||||||
todo: sled::Tree,
|
todo: sled::Tree,
|
||||||
|
@ -69,22 +69,22 @@ impl MerkleUpdater {
|
||||||
) -> Arc<Self> {
|
) -> Arc<Self> {
|
||||||
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
|
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
|
||||||
|
|
||||||
Arc::new(Self {
|
let ret = Arc::new(Self {
|
||||||
table_name,
|
table_name,
|
||||||
background,
|
background,
|
||||||
todo,
|
todo,
|
||||||
todo_notify: Notify::new(),
|
todo_notify: Notify::new(),
|
||||||
merkle_tree,
|
merkle_tree,
|
||||||
empty_node_hash,
|
empty_node_hash,
|
||||||
})
|
});
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn launch(self: &Arc<Self>) {
|
let ret2 = ret.clone();
|
||||||
let self2 = self.clone();
|
ret.background.spawn_worker(
|
||||||
self.background.spawn_worker(
|
format!("Merkle tree updater for {}", ret.table_name),
|
||||||
format!("Merkle tree updater for {}", self.table_name),
|
|must_exit: watch::Receiver<bool>| ret2.updater_loop(must_exit),
|
||||||
|must_exit: watch::Receiver<bool>| self2.updater_loop(must_exit),
|
|
||||||
);
|
);
|
||||||
|
|
||||||
|
ret
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn updater_loop(
|
async fn updater_loop(
|
||||||
|
@ -156,28 +156,37 @@ impl MerkleUpdater {
|
||||||
new_vhash: Option<Hash>,
|
new_vhash: Option<Hash>,
|
||||||
) -> ConflictableTransactionResult<Option<Hash>, Error> {
|
) -> ConflictableTransactionResult<Option<Hash>, Error> {
|
||||||
let i = key.prefix.len();
|
let i = key.prefix.len();
|
||||||
|
|
||||||
|
// Read node at current position (defined by the prefix stored in key)
|
||||||
|
// Calculate an update to apply to this node
|
||||||
|
// This update is an Option<_>, so that it is None if the update is a no-op
|
||||||
|
// and we can thus skip recalculating and re-storing everything
|
||||||
let mutate = match self.read_node_txn(tx, &key)? {
|
let mutate = match self.read_node_txn(tx, &key)? {
|
||||||
MerkleNode::Empty => {
|
MerkleNode::Empty => {
|
||||||
if let Some(vhv) = new_vhash {
|
if let Some(vhv) = new_vhash {
|
||||||
Some(MerkleNode::Leaf(k.to_vec(), vhv))
|
Some(MerkleNode::Leaf(k.to_vec(), vhv))
|
||||||
} else {
|
} else {
|
||||||
|
// Nothing to do, keep empty node
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
MerkleNode::Intermediate(mut children) => {
|
MerkleNode::Intermediate(mut children) => {
|
||||||
let key2 = key.next_key(khash);
|
let key2 = key.next_key(khash);
|
||||||
if let Some(subhash) = self.update_item_rec(tx, k, khash, &key2, new_vhash)? {
|
if let Some(subhash) = self.update_item_rec(tx, k, khash, &key2, new_vhash)? {
|
||||||
|
// Subtree changed, update this node as well
|
||||||
if subhash == self.empty_node_hash {
|
if subhash == self.empty_node_hash {
|
||||||
intermediate_rm_child(&mut children, key2.prefix[i]);
|
intermediate_rm_child(&mut children, key2.prefix[i]);
|
||||||
} else {
|
} else {
|
||||||
intermediate_set_child(&mut children, key2.prefix[i], subhash);
|
intermediate_set_child(&mut children, key2.prefix[i], subhash);
|
||||||
}
|
}
|
||||||
|
|
||||||
if children.len() == 0 {
|
if children.len() == 0 {
|
||||||
// should not happen
|
// should not happen
|
||||||
warn!("Replacing intermediate node with empty node, should not happen.");
|
warn!("Replacing intermediate node with empty node, should not happen.");
|
||||||
Some(MerkleNode::Empty)
|
Some(MerkleNode::Empty)
|
||||||
} else if children.len() == 1 {
|
} else if children.len() == 1 {
|
||||||
// move node down to this level
|
// We now have a single node (case when the update deleted one of only two
|
||||||
|
// children). Move that single child to this level of the tree.
|
||||||
let key_sub = key.add_byte(children[0].0);
|
let key_sub = key.add_byte(children[0].0);
|
||||||
let subnode = self.read_node_txn(tx, &key_sub)?;
|
let subnode = self.read_node_txn(tx, &key_sub)?;
|
||||||
tx.remove(key_sub.encode())?;
|
tx.remove(key_sub.encode())?;
|
||||||
|
@ -186,19 +195,23 @@ impl MerkleUpdater {
|
||||||
Some(MerkleNode::Intermediate(children))
|
Some(MerkleNode::Intermediate(children))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// Subtree not changed, nothing to do
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
MerkleNode::Leaf(exlf_key, exlf_hash) => {
|
MerkleNode::Leaf(exlf_key, exlf_hash) => {
|
||||||
if exlf_key == k {
|
if exlf_key == k {
|
||||||
|
// This leaf is for the same key that the one we are updating
|
||||||
match new_vhash {
|
match new_vhash {
|
||||||
Some(vhv) if vhv == exlf_hash => None,
|
Some(vhv) if vhv == exlf_hash => None,
|
||||||
Some(vhv) => Some(MerkleNode::Leaf(k.to_vec(), vhv)),
|
Some(vhv) => Some(MerkleNode::Leaf(k.to_vec(), vhv)),
|
||||||
None => Some(MerkleNode::Empty),
|
None => Some(MerkleNode::Empty),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// This is an only leaf for another key
|
||||||
if let Some(vhv) = new_vhash {
|
if let Some(vhv) = new_vhash {
|
||||||
// Create two sub-nodes and replace by intermediary node
|
// Move that other key to a subnode, create another subnode for our
|
||||||
|
// insertion and replace current node by an intermediary node
|
||||||
let (pos1, h1) = {
|
let (pos1, h1) = {
|
||||||
let key2 = key.next_key(blake2sum(&exlf_key[..]));
|
let key2 = key.next_key(blake2sum(&exlf_key[..]));
|
||||||
let subhash =
|
let subhash =
|
||||||
|
@ -216,6 +229,9 @@ impl MerkleUpdater {
|
||||||
intermediate_set_child(&mut int, pos2, h2);
|
intermediate_set_child(&mut int, pos2, h2);
|
||||||
Some(MerkleNode::Intermediate(int))
|
Some(MerkleNode::Intermediate(int))
|
||||||
} else {
|
} else {
|
||||||
|
// Nothing to do, we don't want to insert this value because it is None,
|
||||||
|
// and we don't want to change the other value because it's for something
|
||||||
|
// else
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -263,6 +279,7 @@ impl MerkleUpdater {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Access a node in the Merkle tree, used by the sync protocol
|
||||||
pub(crate) fn read_node(
|
pub(crate) fn read_node(
|
||||||
&self,
|
&self,
|
||||||
k: &MerkleNodeKey,
|
k: &MerkleNodeKey,
|
||||||
|
|
|
@ -4,7 +4,7 @@ use garage_rpc::membership::System;
|
||||||
use garage_rpc::ring::Ring;
|
use garage_rpc::ring::Ring;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use crate::*;
|
use crate::replication::*;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct TableFullReplication {
|
pub struct TableFullReplication {
|
6
src/table/replication/mod.rs
Normal file
6
src/table/replication/mod.rs
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
mod parameters;
|
||||||
|
|
||||||
|
pub mod fullcopy;
|
||||||
|
pub mod sharded;
|
||||||
|
|
||||||
|
pub use parameters::*;
|
22
src/table/replication/parameters.rs
Normal file
22
src/table/replication/parameters.rs
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
use garage_rpc::membership::System;
|
||||||
|
use garage_rpc::ring::Ring;
|
||||||
|
|
||||||
|
use garage_util::data::*;
|
||||||
|
|
||||||
|
pub trait TableReplication: Send + Sync {
|
||||||
|
// See examples in table_sharded.rs and table_fullcopy.rs
|
||||||
|
// To understand various replication methods
|
||||||
|
|
||||||
|
// Which nodes to send reads from
|
||||||
|
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
|
||||||
|
fn read_quorum(&self) -> usize;
|
||||||
|
|
||||||
|
// Which nodes to send writes to
|
||||||
|
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
|
||||||
|
fn write_quorum(&self, system: &System) -> usize;
|
||||||
|
fn max_write_errors(&self) -> usize;
|
||||||
|
|
||||||
|
// Which are the nodes that do actually replicate the data
|
||||||
|
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
|
||||||
|
fn split_points(&self, ring: &Ring) -> Vec<Hash>;
|
||||||
|
}
|
|
@ -2,7 +2,7 @@ use garage_rpc::membership::System;
|
||||||
use garage_rpc::ring::Ring;
|
use garage_rpc::ring::Ring;
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
use crate::*;
|
use crate::replication::*;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct TableShardedReplication {
|
pub struct TableShardedReplication {
|
|
@ -2,40 +2,35 @@ use std::collections::{BTreeMap, HashMap};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use log::warn;
|
|
||||||
|
|
||||||
use arc_swap::ArcSwapOption;
|
|
||||||
use futures::stream::*;
|
use futures::stream::*;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_bytes::ByteBuf;
|
use serde_bytes::ByteBuf;
|
||||||
use sled::Transactional;
|
|
||||||
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
use garage_util::error::Error;
|
use garage_util::error::Error;
|
||||||
|
|
||||||
use garage_rpc::membership::System;
|
use garage_rpc::membership::System;
|
||||||
use garage_rpc::ring::Ring;
|
|
||||||
use garage_rpc::rpc_client::*;
|
use garage_rpc::rpc_client::*;
|
||||||
use garage_rpc::rpc_server::*;
|
use garage_rpc::rpc_server::*;
|
||||||
|
|
||||||
use crate::crdt::CRDT;
|
use crate::crdt::CRDT;
|
||||||
use crate::merkle::*;
|
use crate::data::*;
|
||||||
use crate::schema::*;
|
use crate::schema::*;
|
||||||
use crate::table_sync::*;
|
use crate::table_sync::*;
|
||||||
|
use crate::replication::*;
|
||||||
|
|
||||||
const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
|
const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
pub struct Table<F: TableSchema, R: TableReplication> {
|
pub struct TableAux<F: TableSchema, R: TableReplication> {
|
||||||
pub instance: F,
|
|
||||||
pub replication: R,
|
|
||||||
|
|
||||||
pub name: String,
|
|
||||||
pub(crate) rpc_client: Arc<RpcClient<TableRPC<F>>>,
|
|
||||||
|
|
||||||
pub system: Arc<System>,
|
pub system: Arc<System>,
|
||||||
pub store: sled::Tree,
|
pub replication: R,
|
||||||
pub syncer: ArcSwapOption<TableSyncer<F, R>>,
|
pub(crate) rpc_client: Arc<RpcClient<TableRPC<F>>>,
|
||||||
merkle_updater: Arc<MerkleUpdater>,
|
}
|
||||||
|
|
||||||
|
pub struct Table<F: TableSchema, R: TableReplication> {
|
||||||
|
pub data: Arc<TableData<F>>,
|
||||||
|
pub aux: Arc<TableAux<F, R>>,
|
||||||
|
pub syncer: Arc<TableSyncer<F, R>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
|
@ -55,23 +50,6 @@ pub(crate) enum TableRPC<F: TableSchema> {
|
||||||
|
|
||||||
impl<F: TableSchema> RpcMessage for TableRPC<F> {}
|
impl<F: TableSchema> RpcMessage for TableRPC<F> {}
|
||||||
|
|
||||||
pub trait TableReplication: Send + Sync {
|
|
||||||
// See examples in table_sharded.rs and table_fullcopy.rs
|
|
||||||
// To understand various replication methods
|
|
||||||
|
|
||||||
// Which nodes to send reads from
|
|
||||||
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
|
|
||||||
fn read_quorum(&self) -> usize;
|
|
||||||
|
|
||||||
// Which nodes to send writes to
|
|
||||||
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
|
|
||||||
fn write_quorum(&self, system: &System) -> usize;
|
|
||||||
fn max_write_errors(&self) -> usize;
|
|
||||||
|
|
||||||
// Which are the nodes that do actually replicate the data
|
|
||||||
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
|
|
||||||
fn split_points(&self, ring: &Ring) -> Vec<Hash>;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<F, R> Table<F, R>
|
impl<F, R> Table<F, R>
|
||||||
where
|
where
|
||||||
|
@ -88,60 +66,51 @@ where
|
||||||
name: String,
|
name: String,
|
||||||
rpc_server: &mut RpcServer,
|
rpc_server: &mut RpcServer,
|
||||||
) -> Arc<Self> {
|
) -> Arc<Self> {
|
||||||
let store = db
|
|
||||||
.open_tree(&format!("{}:table", name))
|
|
||||||
.expect("Unable to open DB tree");
|
|
||||||
|
|
||||||
let merkle_todo_store = db
|
|
||||||
.open_tree(&format!("{}:merkle_todo", name))
|
|
||||||
.expect("Unable to open DB Merkle TODO tree");
|
|
||||||
let merkle_tree_store = db
|
|
||||||
.open_tree(&format!("{}:merkle_tree", name))
|
|
||||||
.expect("Unable to open DB Merkle tree tree");
|
|
||||||
|
|
||||||
let rpc_path = format!("table_{}", name);
|
let rpc_path = format!("table_{}", name);
|
||||||
let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path);
|
let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path);
|
||||||
|
|
||||||
let merkle_updater = MerkleUpdater::new(
|
let data = TableData::new(
|
||||||
name.clone(),
|
name,
|
||||||
|
instance,
|
||||||
|
db,
|
||||||
system.background.clone(),
|
system.background.clone(),
|
||||||
merkle_todo_store,
|
);
|
||||||
merkle_tree_store,
|
|
||||||
|
let aux = Arc::new(TableAux{
|
||||||
|
system,
|
||||||
|
replication,
|
||||||
|
rpc_client,
|
||||||
|
});
|
||||||
|
|
||||||
|
let syncer = TableSyncer::launch(
|
||||||
|
data.clone(),
|
||||||
|
aux.clone(),
|
||||||
);
|
);
|
||||||
|
|
||||||
let table = Arc::new(Self {
|
let table = Arc::new(Self {
|
||||||
instance,
|
data,
|
||||||
replication,
|
aux,
|
||||||
name,
|
syncer,
|
||||||
rpc_client,
|
|
||||||
system,
|
|
||||||
store,
|
|
||||||
syncer: ArcSwapOption::from(None),
|
|
||||||
merkle_updater,
|
|
||||||
});
|
});
|
||||||
|
|
||||||
table.clone().register_handler(rpc_server, rpc_path);
|
table.clone().register_handler(rpc_server, rpc_path);
|
||||||
|
|
||||||
let syncer = TableSyncer::launch(table.clone());
|
|
||||||
table.syncer.swap(Some(syncer));
|
|
||||||
|
|
||||||
table.merkle_updater.launch();
|
|
||||||
|
|
||||||
table
|
table
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
|
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
|
||||||
let hash = e.partition_key().hash();
|
let hash = e.partition_key().hash();
|
||||||
let who = self.replication.write_nodes(&hash, &self.system);
|
let who = self.aux.replication.write_nodes(&hash, &self.aux.system);
|
||||||
//eprintln!("insert who: {:?}", who);
|
//eprintln!("insert who: {:?}", who);
|
||||||
|
|
||||||
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
|
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
|
||||||
let rpc = TableRPC::<F>::Update(vec![e_enc]);
|
let rpc = TableRPC::<F>::Update(vec![e_enc]);
|
||||||
|
|
||||||
self.rpc_client
|
self.aux.rpc_client
|
||||||
.try_call_many(
|
.try_call_many(
|
||||||
&who[..],
|
&who[..],
|
||||||
rpc,
|
rpc,
|
||||||
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system))
|
RequestStrategy::with_quorum(self.aux.replication.write_quorum(&self.aux.system))
|
||||||
.with_timeout(TABLE_RPC_TIMEOUT),
|
.with_timeout(TABLE_RPC_TIMEOUT),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
@ -153,7 +122,7 @@ where
|
||||||
|
|
||||||
for entry in entries.iter() {
|
for entry in entries.iter() {
|
||||||
let hash = entry.partition_key().hash();
|
let hash = entry.partition_key().hash();
|
||||||
let who = self.replication.write_nodes(&hash, &self.system);
|
let who = self.aux.replication.write_nodes(&hash, &self.aux.system);
|
||||||
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
|
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
|
||||||
for node in who {
|
for node in who {
|
||||||
if !call_list.contains_key(&node) {
|
if !call_list.contains_key(&node) {
|
||||||
|
@ -166,7 +135,7 @@ where
|
||||||
let call_futures = call_list.drain().map(|(node, entries)| async move {
|
let call_futures = call_list.drain().map(|(node, entries)| async move {
|
||||||
let rpc = TableRPC::<F>::Update(entries);
|
let rpc = TableRPC::<F>::Update(entries);
|
||||||
|
|
||||||
let resp = self.rpc_client.call(node, rpc, TABLE_RPC_TIMEOUT).await?;
|
let resp = self.aux.rpc_client.call(node, rpc, TABLE_RPC_TIMEOUT).await?;
|
||||||
Ok::<_, Error>((node, resp))
|
Ok::<_, Error>((node, resp))
|
||||||
});
|
});
|
||||||
let mut resps = call_futures.collect::<FuturesUnordered<_>>();
|
let mut resps = call_futures.collect::<FuturesUnordered<_>>();
|
||||||
|
@ -177,7 +146,7 @@ where
|
||||||
errors.push(e);
|
errors.push(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if errors.len() > self.replication.max_write_errors() {
|
if errors.len() > self.aux.replication.max_write_errors() {
|
||||||
Err(Error::Message("Too many errors".into()))
|
Err(Error::Message("Too many errors".into()))
|
||||||
} else {
|
} else {
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -190,16 +159,17 @@ where
|
||||||
sort_key: &F::S,
|
sort_key: &F::S,
|
||||||
) -> Result<Option<F::E>, Error> {
|
) -> Result<Option<F::E>, Error> {
|
||||||
let hash = partition_key.hash();
|
let hash = partition_key.hash();
|
||||||
let who = self.replication.read_nodes(&hash, &self.system);
|
let who = self.aux.replication.read_nodes(&hash, &self.aux.system);
|
||||||
//eprintln!("get who: {:?}", who);
|
//eprintln!("get who: {:?}", who);
|
||||||
|
|
||||||
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
|
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
|
||||||
let resps = self
|
let resps = self
|
||||||
|
.aux
|
||||||
.rpc_client
|
.rpc_client
|
||||||
.try_call_many(
|
.try_call_many(
|
||||||
&who[..],
|
&who[..],
|
||||||
rpc,
|
rpc,
|
||||||
RequestStrategy::with_quorum(self.replication.read_quorum())
|
RequestStrategy::with_quorum(self.aux.replication.read_quorum())
|
||||||
.with_timeout(TABLE_RPC_TIMEOUT)
|
.with_timeout(TABLE_RPC_TIMEOUT)
|
||||||
.interrupt_after_quorum(true),
|
.interrupt_after_quorum(true),
|
||||||
)
|
)
|
||||||
|
@ -210,7 +180,7 @@ where
|
||||||
for resp in resps {
|
for resp in resps {
|
||||||
if let TableRPC::ReadEntryResponse(value) = resp {
|
if let TableRPC::ReadEntryResponse(value) = resp {
|
||||||
if let Some(v_bytes) = value {
|
if let Some(v_bytes) = value {
|
||||||
let v = self.decode_entry(v_bytes.as_slice())?;
|
let v = self.data.decode_entry(v_bytes.as_slice())?;
|
||||||
ret = match ret {
|
ret = match ret {
|
||||||
None => Some(v),
|
None => Some(v),
|
||||||
Some(mut x) => {
|
Some(mut x) => {
|
||||||
|
@ -230,7 +200,7 @@ where
|
||||||
if not_all_same {
|
if not_all_same {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
let ent2 = ret_entry.clone();
|
let ent2 = ret_entry.clone();
|
||||||
self.system
|
self.aux.system
|
||||||
.background
|
.background
|
||||||
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
|
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
|
||||||
}
|
}
|
||||||
|
@ -246,16 +216,16 @@ where
|
||||||
limit: usize,
|
limit: usize,
|
||||||
) -> Result<Vec<F::E>, Error> {
|
) -> Result<Vec<F::E>, Error> {
|
||||||
let hash = partition_key.hash();
|
let hash = partition_key.hash();
|
||||||
let who = self.replication.read_nodes(&hash, &self.system);
|
let who = self.aux.replication.read_nodes(&hash, &self.aux.system);
|
||||||
|
|
||||||
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
|
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
|
||||||
|
|
||||||
let resps = self
|
let resps = self
|
||||||
.rpc_client
|
.aux.rpc_client
|
||||||
.try_call_many(
|
.try_call_many(
|
||||||
&who[..],
|
&who[..],
|
||||||
rpc,
|
rpc,
|
||||||
RequestStrategy::with_quorum(self.replication.read_quorum())
|
RequestStrategy::with_quorum(self.aux.replication.read_quorum())
|
||||||
.with_timeout(TABLE_RPC_TIMEOUT)
|
.with_timeout(TABLE_RPC_TIMEOUT)
|
||||||
.interrupt_after_quorum(true),
|
.interrupt_after_quorum(true),
|
||||||
)
|
)
|
||||||
|
@ -266,8 +236,8 @@ where
|
||||||
for resp in resps {
|
for resp in resps {
|
||||||
if let TableRPC::Update(entries) = resp {
|
if let TableRPC::Update(entries) = resp {
|
||||||
for entry_bytes in entries.iter() {
|
for entry_bytes in entries.iter() {
|
||||||
let entry = self.decode_entry(entry_bytes.as_slice())?;
|
let entry = self.data.decode_entry(entry_bytes.as_slice())?;
|
||||||
let entry_key = self.tree_key(entry.partition_key(), entry.sort_key());
|
let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key());
|
||||||
match ret.remove(&entry_key) {
|
match ret.remove(&entry_key) {
|
||||||
None => {
|
None => {
|
||||||
ret.insert(entry_key, Some(entry));
|
ret.insert(entry_key, Some(entry));
|
||||||
|
@ -287,7 +257,7 @@ where
|
||||||
}
|
}
|
||||||
if !to_repair.is_empty() {
|
if !to_repair.is_empty() {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
self.system.background.spawn_cancellable(async move {
|
self.aux.system.background.spawn_cancellable(async move {
|
||||||
for (_, v) in to_repair.iter_mut() {
|
for (_, v) in to_repair.iter_mut() {
|
||||||
self2.repair_on_read(&who[..], v.take().unwrap()).await?;
|
self2.repair_on_read(&who[..], v.take().unwrap()).await?;
|
||||||
}
|
}
|
||||||
|
@ -306,7 +276,7 @@ where
|
||||||
|
|
||||||
async fn repair_on_read(&self, who: &[UUID], what: F::E) -> Result<(), Error> {
|
async fn repair_on_read(&self, who: &[UUID], what: F::E) -> Result<(), Error> {
|
||||||
let what_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(&what)?));
|
let what_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(&what)?));
|
||||||
self.rpc_client
|
self.aux.rpc_client
|
||||||
.try_call_many(
|
.try_call_many(
|
||||||
&who[..],
|
&who[..],
|
||||||
TableRPC::<F>::Update(vec![what_enc]),
|
TableRPC::<F>::Update(vec![what_enc]),
|
||||||
|
@ -326,8 +296,8 @@ where
|
||||||
});
|
});
|
||||||
|
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
self.rpc_client
|
self.aux.rpc_client
|
||||||
.set_local_handler(self.system.id, move |msg| {
|
.set_local_handler(self.aux.system.id, move |msg| {
|
||||||
let self2 = self2.clone();
|
let self2 = self2.clone();
|
||||||
async move { self2.handle(&msg).await }
|
async move { self2.handle(&msg).await }
|
||||||
});
|
});
|
||||||
|
@ -336,157 +306,24 @@ where
|
||||||
async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> {
|
async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> {
|
||||||
match msg {
|
match msg {
|
||||||
TableRPC::ReadEntry(key, sort_key) => {
|
TableRPC::ReadEntry(key, sort_key) => {
|
||||||
let value = self.handle_read_entry(key, sort_key)?;
|
let value = self.data.read_entry(key, sort_key)?;
|
||||||
Ok(TableRPC::ReadEntryResponse(value))
|
Ok(TableRPC::ReadEntryResponse(value))
|
||||||
}
|
}
|
||||||
TableRPC::ReadRange(key, begin_sort_key, filter, limit) => {
|
TableRPC::ReadRange(key, begin_sort_key, filter, limit) => {
|
||||||
let values = self.handle_read_range(key, begin_sort_key, filter, *limit)?;
|
let values = self.data.read_range(key, begin_sort_key, filter, *limit)?;
|
||||||
Ok(TableRPC::Update(values))
|
Ok(TableRPC::Update(values))
|
||||||
}
|
}
|
||||||
TableRPC::Update(pairs) => {
|
TableRPC::Update(pairs) => {
|
||||||
self.handle_update(pairs)?;
|
self.data.update_many(pairs)?;
|
||||||
Ok(TableRPC::Ok)
|
Ok(TableRPC::Ok)
|
||||||
}
|
}
|
||||||
TableRPC::SyncRPC(rpc) => {
|
TableRPC::SyncRPC(rpc) => {
|
||||||
let syncer = self.syncer.load_full().unwrap();
|
let response = self.syncer
|
||||||
let response = syncer
|
.handle_rpc(rpc, self.aux.system.background.stop_signal.clone())
|
||||||
.handle_rpc(rpc, self.system.background.stop_signal.clone())
|
|
||||||
.await?;
|
.await?;
|
||||||
Ok(TableRPC::SyncRPC(response))
|
Ok(TableRPC::SyncRPC(response))
|
||||||
}
|
}
|
||||||
_ => Err(Error::BadRPC(format!("Unexpected table RPC"))),
|
_ => Err(Error::BadRPC(format!("Unexpected table RPC"))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn handle_read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
|
|
||||||
let tree_key = self.tree_key(p, s);
|
|
||||||
if let Some(bytes) = self.store.get(&tree_key)? {
|
|
||||||
Ok(Some(ByteBuf::from(bytes.to_vec())))
|
|
||||||
} else {
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn handle_read_range(
|
|
||||||
&self,
|
|
||||||
p: &F::P,
|
|
||||||
s: &Option<F::S>,
|
|
||||||
filter: &Option<F::Filter>,
|
|
||||||
limit: usize,
|
|
||||||
) -> Result<Vec<Arc<ByteBuf>>, Error> {
|
|
||||||
let partition_hash = p.hash();
|
|
||||||
let first_key = match s {
|
|
||||||
None => partition_hash.to_vec(),
|
|
||||||
Some(sk) => self.tree_key(p, sk),
|
|
||||||
};
|
|
||||||
let mut ret = vec![];
|
|
||||||
for item in self.store.range(first_key..) {
|
|
||||||
let (key, value) = item?;
|
|
||||||
if &key[..32] != partition_hash.as_slice() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let keep = match filter {
|
|
||||||
None => true,
|
|
||||||
Some(f) => {
|
|
||||||
let entry = self.decode_entry(value.as_ref())?;
|
|
||||||
F::matches_filter(&entry, f)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if keep {
|
|
||||||
ret.push(Arc::new(ByteBuf::from(value.as_ref())));
|
|
||||||
}
|
|
||||||
if ret.len() >= limit {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(ret)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ========== CODE THAT ACTUALLY MODIFIES THE TREE ================
|
|
||||||
|
|
||||||
pub fn handle_update(self: &Arc<Self>, entries: &[Arc<ByteBuf>]) -> Result<(), Error> {
|
|
||||||
for update_bytes in entries.iter() {
|
|
||||||
self.update_entry(update_bytes.as_slice())?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn update_entry(self: &Arc<Self>, update_bytes: &[u8]) -> Result<(), Error> {
|
|
||||||
let update = self.decode_entry(update_bytes)?;
|
|
||||||
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
|
|
||||||
|
|
||||||
let changed = (&self.store, &self.merkle_updater.todo).transaction(|(db, mkl_todo)| {
|
|
||||||
let (old_entry, new_entry) = match db.get(&tree_key)? {
|
|
||||||
Some(prev_bytes) => {
|
|
||||||
let old_entry = self
|
|
||||||
.decode_entry(&prev_bytes)
|
|
||||||
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
|
|
||||||
let mut new_entry = old_entry.clone();
|
|
||||||
new_entry.merge(&update);
|
|
||||||
(Some(old_entry), new_entry)
|
|
||||||
}
|
|
||||||
None => (None, update.clone()),
|
|
||||||
};
|
|
||||||
|
|
||||||
if Some(&new_entry) != old_entry.as_ref() {
|
|
||||||
let new_bytes = rmp_to_vec_all_named(&new_entry)
|
|
||||||
.map_err(Error::RMPEncode)
|
|
||||||
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
|
|
||||||
mkl_todo.insert(tree_key.clone(), blake2sum(&new_bytes[..]).to_vec())?;
|
|
||||||
db.insert(tree_key.clone(), new_bytes)?;
|
|
||||||
Ok(Some((old_entry, new_entry)))
|
|
||||||
} else {
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if let Some((old_entry, new_entry)) = changed {
|
|
||||||
self.instance.updated(old_entry, Some(new_entry));
|
|
||||||
self.syncer.load_full().unwrap().invalidate(&tree_key[..]);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
|
|
||||||
let removed = (&self.store, &self.merkle_updater.todo).transaction(|(txn, mkl_todo)| {
|
|
||||||
if let Some(cur_v) = txn.get(k)? {
|
|
||||||
if cur_v == v {
|
|
||||||
txn.remove(k)?;
|
|
||||||
mkl_todo.insert(k, vec![])?;
|
|
||||||
return Ok(true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(false)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if removed {
|
|
||||||
let old_entry = self.decode_entry(v)?;
|
|
||||||
self.instance.updated(Some(old_entry), None);
|
|
||||||
self.syncer.load_full().unwrap().invalidate(k);
|
|
||||||
}
|
|
||||||
Ok(removed)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
|
|
||||||
let mut ret = p.hash().to_vec();
|
|
||||||
ret.extend(s.sort_key());
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
|
|
||||||
fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
|
|
||||||
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
|
|
||||||
Ok(x) => Ok(x),
|
|
||||||
Err(e) => match F::try_migrate(bytes) {
|
|
||||||
Some(x) => Ok(x),
|
|
||||||
None => {
|
|
||||||
warn!("Unable to decode entry of {}: {}", self.name, e);
|
|
||||||
for line in hexdump::hexdump_iter(bytes) {
|
|
||||||
debug!("{}", line);
|
|
||||||
}
|
|
||||||
Err(e.into())
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,18 +16,22 @@ use garage_util::data::*;
|
||||||
use garage_util::error::Error;
|
use garage_util::error::Error;
|
||||||
|
|
||||||
use crate::*;
|
use crate::*;
|
||||||
|
use crate::data::*;
|
||||||
|
use crate::replication::*;
|
||||||
|
|
||||||
const MAX_DEPTH: usize = 16;
|
const MAX_DEPTH: usize = 16;
|
||||||
|
|
||||||
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
|
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
|
||||||
|
|
||||||
// Scan & sync every 12 hours
|
// Do anti-entropy every 10 minutes
|
||||||
const SCAN_INTERVAL: Duration = Duration::from_secs(12 * 60 * 60);
|
const SCAN_INTERVAL: Duration = Duration::from_secs(10 * 60);
|
||||||
|
|
||||||
// Expire cache after 30 minutes
|
const CHECKSUM_CACHE_TIMEOUT: Duration = Duration::from_secs(10 * 60);
|
||||||
const CHECKSUM_CACHE_TIMEOUT: Duration = Duration::from_secs(30 * 60);
|
|
||||||
|
|
||||||
pub struct TableSyncer<F: TableSchema, R: TableReplication> {
|
pub struct TableSyncer<F: TableSchema, R: TableReplication> {
|
||||||
table: Arc<Table<F, R>>,
|
data: Arc<TableData<F>>,
|
||||||
|
aux: Arc<TableAux<F, R>>,
|
||||||
|
|
||||||
todo: Mutex<SyncTodo>,
|
todo: Mutex<SyncTodo>,
|
||||||
cache: Vec<Mutex<BTreeMap<SyncRange, RangeChecksumCache>>>,
|
cache: Vec<Mutex<BTreeMap<SyncRange, RangeChecksumCache>>>,
|
||||||
}
|
}
|
||||||
|
@ -106,10 +110,13 @@ where
|
||||||
F: TableSchema + 'static,
|
F: TableSchema + 'static,
|
||||||
R: TableReplication + 'static,
|
R: TableReplication + 'static,
|
||||||
{
|
{
|
||||||
pub(crate) fn launch(table: Arc<Table<F, R>>) -> Arc<Self> {
|
pub(crate) fn launch(data: Arc<TableData<F>>,
|
||||||
let todo = SyncTodo { todo: Vec::new() };
|
aux: Arc<TableAux<F, R>>) -> Arc<Self> {
|
||||||
let syncer = Arc::new(TableSyncer {
|
let todo = SyncTodo{ todo: vec![] };
|
||||||
table: table.clone(),
|
|
||||||
|
let syncer = Arc::new(Self {
|
||||||
|
data: data.clone(),
|
||||||
|
aux: aux.clone(),
|
||||||
todo: Mutex::new(todo),
|
todo: Mutex::new(todo),
|
||||||
cache: (0..MAX_DEPTH)
|
cache: (0..MAX_DEPTH)
|
||||||
.map(|_| Mutex::new(BTreeMap::new()))
|
.map(|_| Mutex::new(BTreeMap::new()))
|
||||||
|
@ -119,21 +126,21 @@ where
|
||||||
let (busy_tx, busy_rx) = mpsc::unbounded_channel();
|
let (busy_tx, busy_rx) = mpsc::unbounded_channel();
|
||||||
|
|
||||||
let s1 = syncer.clone();
|
let s1 = syncer.clone();
|
||||||
table.system.background.spawn_worker(
|
aux.system.background.spawn_worker(
|
||||||
format!("table sync watcher for {}", table.name),
|
format!("table sync watcher for {}", data.name),
|
||||||
move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
|
move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
|
||||||
);
|
);
|
||||||
|
|
||||||
let s2 = syncer.clone();
|
let s2 = syncer.clone();
|
||||||
table.system.background.spawn_worker(
|
aux.system.background.spawn_worker(
|
||||||
format!("table syncer for {}", table.name),
|
format!("table syncer for {}", data.name),
|
||||||
move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
|
move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
|
||||||
);
|
);
|
||||||
|
|
||||||
let s3 = syncer.clone();
|
let s3 = syncer.clone();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
tokio::time::delay_for(Duration::from_secs(20)).await;
|
tokio::time::delay_for(Duration::from_secs(20)).await;
|
||||||
s3.add_full_scan().await;
|
s3.add_full_scan();
|
||||||
});
|
});
|
||||||
|
|
||||||
syncer
|
syncer
|
||||||
|
@ -144,8 +151,8 @@ where
|
||||||
mut must_exit: watch::Receiver<bool>,
|
mut must_exit: watch::Receiver<bool>,
|
||||||
mut busy_rx: mpsc::UnboundedReceiver<bool>,
|
mut busy_rx: mpsc::UnboundedReceiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let mut prev_ring: Arc<Ring> = self.table.system.ring.borrow().clone();
|
let mut prev_ring: Arc<Ring> = self.aux.system.ring.borrow().clone();
|
||||||
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.table.system.ring.clone();
|
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.aux.system.ring.clone();
|
||||||
let mut nothing_to_do_since = Some(Instant::now());
|
let mut nothing_to_do_since = Some(Instant::now());
|
||||||
|
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
|
@ -158,8 +165,8 @@ where
|
||||||
select! {
|
select! {
|
||||||
new_ring_r = s_ring_recv => {
|
new_ring_r = s_ring_recv => {
|
||||||
if let Some(new_ring) = new_ring_r {
|
if let Some(new_ring) = new_ring_r {
|
||||||
debug!("({}) Adding ring difference to syncer todo list", self.table.name);
|
debug!("({}) Adding ring difference to syncer todo list", self.data.name);
|
||||||
self.todo.lock().unwrap().add_ring_difference(&self.table, &prev_ring, &new_ring);
|
self.todo.lock().unwrap().add_ring_difference(&prev_ring, &new_ring, &self.data, &self.aux);
|
||||||
prev_ring = new_ring;
|
prev_ring = new_ring;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -182,8 +189,8 @@ where
|
||||||
_ = s_timeout => {
|
_ = s_timeout => {
|
||||||
if nothing_to_do_since.map(|t| Instant::now() - t >= SCAN_INTERVAL).unwrap_or(false) {
|
if nothing_to_do_since.map(|t| Instant::now() - t >= SCAN_INTERVAL).unwrap_or(false) {
|
||||||
nothing_to_do_since = None;
|
nothing_to_do_since = None;
|
||||||
debug!("({}) Adding full scan to syncer todo list", self.table.name);
|
debug!("({}) Adding full scan to syncer todo list", self.data.name);
|
||||||
self.add_full_scan().await;
|
self.add_full_scan();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -191,8 +198,8 @@ where
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn add_full_scan(&self) {
|
pub fn add_full_scan(&self) {
|
||||||
self.todo.lock().unwrap().add_full_scan(&self.table);
|
self.todo.lock().unwrap().add_full_scan(&self.data, &self.aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn syncer_task(
|
async fn syncer_task(
|
||||||
|
@ -211,7 +218,7 @@ where
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
warn!(
|
warn!(
|
||||||
"({}) Error while syncing {:?}: {}",
|
"({}) Error while syncing {:?}: {}",
|
||||||
self.table.name, partition, e
|
self.data.name, partition, e
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -228,18 +235,18 @@ where
|
||||||
must_exit: &mut watch::Receiver<bool>,
|
must_exit: &mut watch::Receiver<bool>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
if partition.retain {
|
if partition.retain {
|
||||||
let my_id = self.table.system.id;
|
let my_id = self.aux.system.id;
|
||||||
let nodes = self
|
let nodes = self
|
||||||
.table
|
.aux
|
||||||
.replication
|
.replication
|
||||||
.write_nodes(&partition.begin, &self.table.system)
|
.write_nodes(&partition.begin, &self.aux.system)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter(|node| *node != my_id)
|
.filter(|node| *node != my_id)
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
"({}) Preparing to sync {:?} with {:?}...",
|
"({}) Preparing to sync {:?} with {:?}...",
|
||||||
self.table.name, partition, nodes
|
self.data.name, partition, nodes
|
||||||
);
|
);
|
||||||
let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?;
|
let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?;
|
||||||
|
|
||||||
|
@ -259,10 +266,10 @@ where
|
||||||
while let Some(r) = sync_futures.next().await {
|
while let Some(r) = sync_futures.next().await {
|
||||||
if let Err(e) = r {
|
if let Err(e) = r {
|
||||||
n_errors += 1;
|
n_errors += 1;
|
||||||
warn!("({}) Sync error: {}", self.table.name, e);
|
warn!("({}) Sync error: {}", self.data.name, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if n_errors > self.table.replication.max_write_errors() {
|
if n_errors > self.aux.replication.max_write_errors() {
|
||||||
return Err(Error::Message(format!(
|
return Err(Error::Message(format!(
|
||||||
"Sync failed with too many nodes (should have been: {:?}).",
|
"Sync failed with too many nodes (should have been: {:?}).",
|
||||||
nodes
|
nodes
|
||||||
|
@ -293,7 +300,7 @@ where
|
||||||
while !*must_exit.borrow() {
|
while !*must_exit.borrow() {
|
||||||
let mut items = Vec::new();
|
let mut items = Vec::new();
|
||||||
|
|
||||||
for item in self.table.store.range(begin.to_vec()..end.to_vec()) {
|
for item in self.data.store.range(begin.to_vec()..end.to_vec()) {
|
||||||
let (key, value) = item?;
|
let (key, value) = item?;
|
||||||
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
|
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
|
||||||
|
|
||||||
|
@ -304,12 +311,12 @@ where
|
||||||
|
|
||||||
if items.len() > 0 {
|
if items.len() > 0 {
|
||||||
let nodes = self
|
let nodes = self
|
||||||
.table
|
.aux
|
||||||
.replication
|
.replication
|
||||||
.write_nodes(&begin, &self.table.system)
|
.write_nodes(&begin, &self.aux.system)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
if nodes.contains(&self.table.system.id) {
|
if nodes.contains(&self.aux.system.id) {
|
||||||
warn!("Interrupting offload as partitions seem to have changed");
|
warn!("Interrupting offload as partitions seem to have changed");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -340,7 +347,7 @@ where
|
||||||
let update_msg = Arc::new(TableRPC::<F>::Update(values));
|
let update_msg = Arc::new(TableRPC::<F>::Update(values));
|
||||||
|
|
||||||
for res in join_all(nodes.iter().map(|to| {
|
for res in join_all(nodes.iter().map(|to| {
|
||||||
self.table
|
self.aux
|
||||||
.rpc_client
|
.rpc_client
|
||||||
.call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT)
|
.call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT)
|
||||||
}))
|
}))
|
||||||
|
@ -352,7 +359,7 @@ where
|
||||||
// All remote nodes have written those items, now we can delete them locally
|
// All remote nodes have written those items, now we can delete them locally
|
||||||
let mut not_removed = 0;
|
let mut not_removed = 0;
|
||||||
for (k, v) in items.iter() {
|
for (k, v) in items.iter() {
|
||||||
if !self.table.delete_if_equal(&k[..], &v[..])? {
|
if !self.data.delete_if_equal(&k[..], &v[..])? {
|
||||||
not_removed += 1;
|
not_removed += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -399,7 +406,7 @@ where
|
||||||
if range.level == 1 {
|
if range.level == 1 {
|
||||||
let mut children = vec![];
|
let mut children = vec![];
|
||||||
for item in self
|
for item in self
|
||||||
.table
|
.data
|
||||||
.store
|
.store
|
||||||
.range(range.begin.clone()..range.end.clone())
|
.range(range.begin.clone()..range.end.clone())
|
||||||
{
|
{
|
||||||
|
@ -516,7 +523,7 @@ where
|
||||||
let v = self.range_checksum(&range, must_exit)?;
|
let v = self.range_checksum(&range, must_exit)?;
|
||||||
trace!(
|
trace!(
|
||||||
"({}) New checksum calculated for {}-{}/{}, {} children",
|
"({}) New checksum calculated for {}-{}/{}, {} children",
|
||||||
self.table.name,
|
self.data.name,
|
||||||
hex::encode(&range.begin)
|
hex::encode(&range.begin)
|
||||||
.chars()
|
.chars()
|
||||||
.take(16)
|
.take(16)
|
||||||
|
@ -553,7 +560,7 @@ where
|
||||||
|
|
||||||
// If their root checksum has level > than us, use that as a reference
|
// If their root checksum has level > than us, use that as a reference
|
||||||
let root_cks_resp = self
|
let root_cks_resp = self
|
||||||
.table
|
.aux
|
||||||
.rpc_client
|
.rpc_client
|
||||||
.call(
|
.call(
|
||||||
who,
|
who,
|
||||||
|
@ -582,7 +589,7 @@ where
|
||||||
let total_children = todo.iter().map(|x| x.children.len()).fold(0, |x, y| x + y);
|
let total_children = todo.iter().map(|x| x.children.len()).fold(0, |x, y| x + y);
|
||||||
trace!(
|
trace!(
|
||||||
"({}) Sync with {:?}: {} ({}) remaining",
|
"({}) Sync with {:?}: {} ({}) remaining",
|
||||||
self.table.name,
|
self.data.name,
|
||||||
who,
|
who,
|
||||||
todo.len(),
|
todo.len(),
|
||||||
total_children
|
total_children
|
||||||
|
@ -592,7 +599,7 @@ where
|
||||||
let step = todo.drain(..step_size).collect::<Vec<_>>();
|
let step = todo.drain(..step_size).collect::<Vec<_>>();
|
||||||
|
|
||||||
let rpc_resp = self
|
let rpc_resp = self
|
||||||
.table
|
.aux
|
||||||
.rpc_client
|
.rpc_client
|
||||||
.call(
|
.call(
|
||||||
who,
|
who,
|
||||||
|
@ -606,7 +613,7 @@ where
|
||||||
if diff_ranges.len() > 0 || diff_items.len() > 0 {
|
if diff_ranges.len() > 0 || diff_items.len() > 0 {
|
||||||
info!(
|
info!(
|
||||||
"({}) Sync with {:?}: difference {} ranges, {} items",
|
"({}) Sync with {:?}: difference {} ranges, {} items",
|
||||||
self.table.name,
|
self.data.name,
|
||||||
who,
|
who,
|
||||||
diff_ranges.len(),
|
diff_ranges.len(),
|
||||||
diff_items.len()
|
diff_items.len()
|
||||||
|
@ -622,7 +629,7 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if diff_items.len() > 0 {
|
if diff_items.len() > 0 {
|
||||||
self.table.handle_update(&diff_items[..])?;
|
self.data.update_many(&diff_items[..])?;
|
||||||
}
|
}
|
||||||
if items_to_send.len() > 0 {
|
if items_to_send.len() > 0 {
|
||||||
self.send_items(who, items_to_send).await?;
|
self.send_items(who, items_to_send).await?;
|
||||||
|
@ -640,19 +647,19 @@ where
|
||||||
async fn send_items(&self, who: UUID, item_list: Vec<Vec<u8>>) -> Result<(), Error> {
|
async fn send_items(&self, who: UUID, item_list: Vec<Vec<u8>>) -> Result<(), Error> {
|
||||||
info!(
|
info!(
|
||||||
"({}) Sending {} items to {:?}",
|
"({}) Sending {} items to {:?}",
|
||||||
self.table.name,
|
self.data.name,
|
||||||
item_list.len(),
|
item_list.len(),
|
||||||
who
|
who
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut values = vec![];
|
let mut values = vec![];
|
||||||
for item in item_list.iter() {
|
for item in item_list.iter() {
|
||||||
if let Some(v) = self.table.store.get(&item[..])? {
|
if let Some(v) = self.data.store.get(&item[..])? {
|
||||||
values.push(Arc::new(ByteBuf::from(v.as_ref())));
|
values.push(Arc::new(ByteBuf::from(v.as_ref())));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let rpc_resp = self
|
let rpc_resp = self
|
||||||
.table
|
.aux
|
||||||
.rpc_client
|
.rpc_client
|
||||||
.call(who, TableRPC::<F>::Update(values), TABLE_SYNC_RPC_TIMEOUT)
|
.call(who, TableRPC::<F>::Update(values), TABLE_SYNC_RPC_TIMEOUT)
|
||||||
.await?;
|
.await?;
|
||||||
|
@ -714,7 +721,7 @@ where
|
||||||
ret_ranges.push(their_range.clone());
|
ret_ranges.push(their_range.clone());
|
||||||
if their_range.level == 0 {
|
if their_range.level == 0 {
|
||||||
if let Some(item_bytes) =
|
if let Some(item_bytes) =
|
||||||
self.table.store.get(their_range.begin.as_slice())?
|
self.data.store.get(their_range.begin.as_slice())?
|
||||||
{
|
{
|
||||||
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
|
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
|
||||||
}
|
}
|
||||||
|
@ -738,7 +745,7 @@ where
|
||||||
}
|
}
|
||||||
if our_range.level == 0 {
|
if our_range.level == 0 {
|
||||||
if let Some(item_bytes) =
|
if let Some(item_bytes) =
|
||||||
self.table.store.get(our_range.begin.as_slice())?
|
self.data.store.get(our_range.begin.as_slice())?
|
||||||
{
|
{
|
||||||
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
|
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
|
||||||
}
|
}
|
||||||
|
@ -753,7 +760,7 @@ where
|
||||||
if ret_ranges.len() > 0 || ret_items.len() > 0 {
|
if ret_ranges.len() > 0 || ret_items.len() > 0 {
|
||||||
trace!(
|
trace!(
|
||||||
"({}) Checksum comparison RPC: {} different + {} items for {} received",
|
"({}) Checksum comparison RPC: {} different + {} items for {} received",
|
||||||
self.table.name,
|
self.data.name,
|
||||||
ret_ranges.len(),
|
ret_ranges.len(),
|
||||||
ret_items.len(),
|
ret_items.len(),
|
||||||
n_checksums
|
n_checksums
|
||||||
|
@ -782,13 +789,13 @@ where
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SyncTodo {
|
impl SyncTodo {
|
||||||
fn add_full_scan<F: TableSchema, R: TableReplication>(&mut self, table: &Table<F, R>) {
|
fn add_full_scan<F: TableSchema, R: TableReplication>(&mut self, data: &TableData<F>, aux: &TableAux<F, R>) {
|
||||||
let my_id = table.system.id;
|
let my_id = aux.system.id;
|
||||||
|
|
||||||
self.todo.clear();
|
self.todo.clear();
|
||||||
|
|
||||||
let ring = table.system.ring.borrow().clone();
|
let ring = aux.system.ring.borrow().clone();
|
||||||
let split_points = table.replication.split_points(&ring);
|
let split_points = aux.replication.split_points(&ring);
|
||||||
|
|
||||||
for i in 0..split_points.len() - 1 {
|
for i in 0..split_points.len() - 1 {
|
||||||
let begin = split_points[i];
|
let begin = split_points[i];
|
||||||
|
@ -797,12 +804,12 @@ impl SyncTodo {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let nodes = table.replication.replication_nodes(&begin, &ring);
|
let nodes = aux.replication.replication_nodes(&begin, &ring);
|
||||||
|
|
||||||
let retain = nodes.contains(&my_id);
|
let retain = nodes.contains(&my_id);
|
||||||
if !retain {
|
if !retain {
|
||||||
// Check if we have some data to send, otherwise skip
|
// Check if we have some data to send, otherwise skip
|
||||||
if table.store.range(begin..end).next().is_none() {
|
if data.store.range(begin..end).next().is_none() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -813,25 +820,25 @@ impl SyncTodo {
|
||||||
|
|
||||||
fn add_ring_difference<F: TableSchema, R: TableReplication>(
|
fn add_ring_difference<F: TableSchema, R: TableReplication>(
|
||||||
&mut self,
|
&mut self,
|
||||||
table: &Table<F, R>,
|
|
||||||
old_ring: &Ring,
|
old_ring: &Ring,
|
||||||
new_ring: &Ring,
|
new_ring: &Ring,
|
||||||
|
data: &TableData<F>, aux: &TableAux<F, R>,
|
||||||
) {
|
) {
|
||||||
let my_id = table.system.id;
|
let my_id = aux.system.id;
|
||||||
|
|
||||||
// If it is us who are entering or leaving the system,
|
// If it is us who are entering or leaving the system,
|
||||||
// initiate a full sync instead of incremental sync
|
// initiate a full sync instead of incremental sync
|
||||||
if old_ring.config.members.contains_key(&my_id)
|
if old_ring.config.members.contains_key(&my_id)
|
||||||
!= new_ring.config.members.contains_key(&my_id)
|
!= new_ring.config.members.contains_key(&my_id)
|
||||||
{
|
{
|
||||||
self.add_full_scan(table);
|
self.add_full_scan(data, aux);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut all_points = None
|
let mut all_points = None
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.chain(table.replication.split_points(old_ring).drain(..))
|
.chain(aux.replication.split_points(old_ring).drain(..))
|
||||||
.chain(table.replication.split_points(new_ring).drain(..))
|
.chain(aux.replication.split_points(new_ring).drain(..))
|
||||||
.chain(self.todo.iter().map(|x| x.begin))
|
.chain(self.todo.iter().map(|x| x.begin))
|
||||||
.chain(self.todo.iter().map(|x| x.end))
|
.chain(self.todo.iter().map(|x| x.end))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
@ -845,11 +852,11 @@ impl SyncTodo {
|
||||||
for i in 0..all_points.len() - 1 {
|
for i in 0..all_points.len() - 1 {
|
||||||
let begin = all_points[i];
|
let begin = all_points[i];
|
||||||
let end = all_points[i + 1];
|
let end = all_points[i + 1];
|
||||||
let was_ours = table
|
let was_ours = aux
|
||||||
.replication
|
.replication
|
||||||
.replication_nodes(&begin, &old_ring)
|
.replication_nodes(&begin, &old_ring)
|
||||||
.contains(&my_id);
|
.contains(&my_id);
|
||||||
let is_ours = table
|
let is_ours = aux
|
||||||
.replication
|
.replication
|
||||||
.replication_nodes(&begin, &new_ring)
|
.replication_nodes(&begin, &new_ring)
|
||||||
.contains(&my_id);
|
.contains(&my_id);
|
||||||
|
|
Loading…
Reference in a new issue