From 302502f4c10b4c1cd03d3b098b3e55a3f70054f2 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 19 Apr 2020 15:14:23 +0200 Subject: [PATCH] Add support for fully replicated tables with epidemic dissemination of updates --- src/main.rs | 1 + src/table.rs | 20 +++++++-- src/table_fullcopy.rs | 100 ++++++++++++++++++++++++++++++++++++++++++ src/table_sharded.rs | 2 +- src/table_sync.rs | 6 +-- 5 files changed, 122 insertions(+), 7 deletions(-) create mode 100644 src/table_fullcopy.rs diff --git a/src/main.rs b/src/main.rs index cc9da8e2..89953223 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,7 @@ mod error; mod background; mod membership; mod table; +mod table_fullcopy; mod table_sharded; mod table_sync; diff --git a/src/table.rs b/src/table.rs index d5357277..619c96d2 100644 --- a/src/table.rs +++ b/src/table.rs @@ -427,6 +427,8 @@ where self: &Arc, mut entries: Vec>, ) -> Result<(), Error> { + let mut epidemic_propagate = vec![]; + for update_bytes in entries.drain(..) { let update = rmp_serde::decode::from_read_ref::<_, F::E>(update_bytes.as_slice())?; @@ -449,16 +451,28 @@ where .map_err(Error::RMPEncode) .map_err(sled::ConflictableTransactionError::Abort)?; db.insert(tree_key.clone(), new_bytes)?; - Ok((old_entry, Some(new_entry))) + Ok((old_entry, new_entry)) })?; - if old_entry != new_entry { - self.instance.updated(old_entry, new_entry).await; + if old_entry.as_ref() != Some(&new_entry) { + if self.replication.epidemic_writes() { + epidemic_propagate.push(new_entry.clone()); + } + + self.instance.updated(old_entry, Some(new_entry)).await; let syncer = self.syncer.load_full().unwrap(); self.system.background.spawn(syncer.invalidate(tree_key)); } } + + if epidemic_propagate.len() > 0 { + let self2 = self.clone(); + self.system + .background + .spawn(async move { self2.insert_many(&epidemic_propagate[..]).await }); + } + Ok(()) } diff --git a/src/table_fullcopy.rs b/src/table_fullcopy.rs new file mode 100644 index 00000000..d5194d55 --- /dev/null +++ b/src/table_fullcopy.rs @@ -0,0 +1,100 @@ +use arc_swap::ArcSwapOption; +use std::sync::Arc; + +use crate::data::*; +use crate::membership::{Ring, System}; +use crate::table::*; + +#[derive(Clone)] +pub struct TableFullReplication { + pub write_factor: usize, + pub write_quorum: usize, + + neighbors: ArcSwapOption, +} + +#[derive(Clone)] +struct Neighbors { + ring: Arc, + neighbors: Vec, +} + +impl TableFullReplication { + pub fn new(write_factor: usize, write_quorum: usize) -> Self { + TableFullReplication { + write_factor, + write_quorum, + neighbors: ArcSwapOption::from(None), + } + } + + fn get_neighbors(&self, system: &System) -> Vec { + let neighbors = self.neighbors.load_full(); + if let Some(n) = neighbors { + if Arc::ptr_eq(&n.ring, &system.ring.borrow()) { + return n.neighbors.clone(); + } + } + + // Recalculate neighbors + let ring = system.ring.borrow().clone(); + let my_id = system.id.clone(); + + let mut nodes = vec![]; + for (node, _) in ring.config.members.iter() { + let node_ranking = hash(&[node.as_slice(), my_id.as_slice()].concat()); + nodes.push((node.clone(), node_ranking)); + } + nodes.sort_by(|(_, rank1), (_, rank2)| rank1.cmp(rank2)); + let mut neighbors = nodes + .drain(..) + .map(|(node, _)| node) + .filter(|node| *node != my_id) + .take(self.write_factor) + .collect::>(); + neighbors.push(my_id); + self.neighbors.swap(Some(Arc::new(Neighbors { + ring, + neighbors: neighbors.clone(), + }))); + neighbors + } +} + +impl TableReplication for TableFullReplication { + // Full replication schema: all nodes store everything + // Writes are disseminated in an epidemic manner in the network + + // Advantage: do all reads locally, extremely fast + // Inconvenient: only suitable to reasonably small tables + + fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec { + vec![system.id.clone()] + } + fn read_quorum(&self) -> usize { + 1 + } + + fn write_nodes(&self, _hash: &Hash, system: &System) -> Vec { + self.get_neighbors(system) + } + fn write_quorum(&self) -> usize { + self.write_quorum + } + fn max_write_errors(&self) -> usize { + self.write_factor - self.write_quorum + } + fn epidemic_writes(&self) -> bool { + true + } + + fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec { + ring.config.members.keys().cloned().collect::>() + } + fn split_points(&self, _ring: &Ring) -> Vec { + let mut ret = vec![]; + ret.push([0u8; 32].into()); + ret.push([0xFFu8; 32].into()); + ret + } +} diff --git a/src/table_sharded.rs b/src/table_sharded.rs index 485a9212..6a174d05 100644 --- a/src/table_sharded.rs +++ b/src/table_sharded.rs @@ -1,5 +1,5 @@ use crate::data::*; -use crate::membership::{System, Ring}; +use crate::membership::{Ring, System}; use crate::table::*; #[derive(Clone)] diff --git a/src/table_sync.rs b/src/table_sync.rs index b4555a77..550ad0f0 100644 --- a/src/table_sync.rs +++ b/src/table_sync.rs @@ -604,7 +604,7 @@ impl SyncTodo { for i in 0..split_points.len() - 1 { let begin = split_points[i].clone(); let end = split_points[i + 1].clone(); - let nodes = table.replication.write_nodes_from_ring(&begin, &ring); + let nodes = table.replication.replication_nodes(&begin, &ring); let retain = nodes.contains(&my_id); if !retain { @@ -650,11 +650,11 @@ impl SyncTodo { let end = all_points[i + 1].clone(); let was_ours = table .replication - .write_nodes_from_ring(&begin, &old_ring) + .replication_nodes(&begin, &old_ring) .contains(&my_id); let is_ours = table .replication - .write_nodes_from_ring(&begin, &new_ring) + .replication_nodes(&begin, &new_ring) .contains(&my_id); let was_todo = match old_todo.binary_search_by(|x| x.begin.cmp(&begin)) {