forked from Deuxfleurs/garage
Add support for fully replicated tables with epidemic dissemination of updates
This commit is contained in:
parent
7131553c53
commit
302502f4c1
5 changed files with 122 additions and 7 deletions
|
@ -4,6 +4,7 @@ mod error;
|
||||||
mod background;
|
mod background;
|
||||||
mod membership;
|
mod membership;
|
||||||
mod table;
|
mod table;
|
||||||
|
mod table_fullcopy;
|
||||||
mod table_sharded;
|
mod table_sharded;
|
||||||
mod table_sync;
|
mod table_sync;
|
||||||
|
|
||||||
|
|
20
src/table.rs
20
src/table.rs
|
@ -427,6 +427,8 @@ where
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
mut entries: Vec<Arc<ByteBuf>>,
|
mut entries: Vec<Arc<ByteBuf>>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
|
let mut epidemic_propagate = vec![];
|
||||||
|
|
||||||
for update_bytes in entries.drain(..) {
|
for update_bytes in entries.drain(..) {
|
||||||
let update = rmp_serde::decode::from_read_ref::<_, F::E>(update_bytes.as_slice())?;
|
let update = rmp_serde::decode::from_read_ref::<_, F::E>(update_bytes.as_slice())?;
|
||||||
|
|
||||||
|
@ -449,16 +451,28 @@ where
|
||||||
.map_err(Error::RMPEncode)
|
.map_err(Error::RMPEncode)
|
||||||
.map_err(sled::ConflictableTransactionError::Abort)?;
|
.map_err(sled::ConflictableTransactionError::Abort)?;
|
||||||
db.insert(tree_key.clone(), new_bytes)?;
|
db.insert(tree_key.clone(), new_bytes)?;
|
||||||
Ok((old_entry, Some(new_entry)))
|
Ok((old_entry, new_entry))
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
if old_entry != new_entry {
|
if old_entry.as_ref() != Some(&new_entry) {
|
||||||
self.instance.updated(old_entry, new_entry).await;
|
if self.replication.epidemic_writes() {
|
||||||
|
epidemic_propagate.push(new_entry.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
self.instance.updated(old_entry, Some(new_entry)).await;
|
||||||
|
|
||||||
let syncer = self.syncer.load_full().unwrap();
|
let syncer = self.syncer.load_full().unwrap();
|
||||||
self.system.background.spawn(syncer.invalidate(tree_key));
|
self.system.background.spawn(syncer.invalidate(tree_key));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if epidemic_propagate.len() > 0 {
|
||||||
|
let self2 = self.clone();
|
||||||
|
self.system
|
||||||
|
.background
|
||||||
|
.spawn(async move { self2.insert_many(&epidemic_propagate[..]).await });
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
100
src/table_fullcopy.rs
Normal file
100
src/table_fullcopy.rs
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
use arc_swap::ArcSwapOption;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use crate::data::*;
|
||||||
|
use crate::membership::{Ring, System};
|
||||||
|
use crate::table::*;
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct TableFullReplication {
|
||||||
|
pub write_factor: usize,
|
||||||
|
pub write_quorum: usize,
|
||||||
|
|
||||||
|
neighbors: ArcSwapOption<Neighbors>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
struct Neighbors {
|
||||||
|
ring: Arc<Ring>,
|
||||||
|
neighbors: Vec<UUID>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TableFullReplication {
|
||||||
|
pub fn new(write_factor: usize, write_quorum: usize) -> Self {
|
||||||
|
TableFullReplication {
|
||||||
|
write_factor,
|
||||||
|
write_quorum,
|
||||||
|
neighbors: ArcSwapOption::from(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_neighbors(&self, system: &System) -> Vec<UUID> {
|
||||||
|
let neighbors = self.neighbors.load_full();
|
||||||
|
if let Some(n) = neighbors {
|
||||||
|
if Arc::ptr_eq(&n.ring, &system.ring.borrow()) {
|
||||||
|
return n.neighbors.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recalculate neighbors
|
||||||
|
let ring = system.ring.borrow().clone();
|
||||||
|
let my_id = system.id.clone();
|
||||||
|
|
||||||
|
let mut nodes = vec![];
|
||||||
|
for (node, _) in ring.config.members.iter() {
|
||||||
|
let node_ranking = hash(&[node.as_slice(), my_id.as_slice()].concat());
|
||||||
|
nodes.push((node.clone(), node_ranking));
|
||||||
|
}
|
||||||
|
nodes.sort_by(|(_, rank1), (_, rank2)| rank1.cmp(rank2));
|
||||||
|
let mut neighbors = nodes
|
||||||
|
.drain(..)
|
||||||
|
.map(|(node, _)| node)
|
||||||
|
.filter(|node| *node != my_id)
|
||||||
|
.take(self.write_factor)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
neighbors.push(my_id);
|
||||||
|
self.neighbors.swap(Some(Arc::new(Neighbors {
|
||||||
|
ring,
|
||||||
|
neighbors: neighbors.clone(),
|
||||||
|
})));
|
||||||
|
neighbors
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TableReplication for TableFullReplication {
|
||||||
|
// Full replication schema: all nodes store everything
|
||||||
|
// Writes are disseminated in an epidemic manner in the network
|
||||||
|
|
||||||
|
// Advantage: do all reads locally, extremely fast
|
||||||
|
// Inconvenient: only suitable to reasonably small tables
|
||||||
|
|
||||||
|
fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec<UUID> {
|
||||||
|
vec![system.id.clone()]
|
||||||
|
}
|
||||||
|
fn read_quorum(&self) -> usize {
|
||||||
|
1
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_nodes(&self, _hash: &Hash, system: &System) -> Vec<UUID> {
|
||||||
|
self.get_neighbors(system)
|
||||||
|
}
|
||||||
|
fn write_quorum(&self) -> usize {
|
||||||
|
self.write_quorum
|
||||||
|
}
|
||||||
|
fn max_write_errors(&self) -> usize {
|
||||||
|
self.write_factor - self.write_quorum
|
||||||
|
}
|
||||||
|
fn epidemic_writes(&self) -> bool {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec<UUID> {
|
||||||
|
ring.config.members.keys().cloned().collect::<Vec<_>>()
|
||||||
|
}
|
||||||
|
fn split_points(&self, _ring: &Ring) -> Vec<Hash> {
|
||||||
|
let mut ret = vec![];
|
||||||
|
ret.push([0u8; 32].into());
|
||||||
|
ret.push([0xFFu8; 32].into());
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
use crate::data::*;
|
use crate::data::*;
|
||||||
use crate::membership::{System, Ring};
|
use crate::membership::{Ring, System};
|
||||||
use crate::table::*;
|
use crate::table::*;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
|
|
|
@ -604,7 +604,7 @@ impl SyncTodo {
|
||||||
for i in 0..split_points.len() - 1 {
|
for i in 0..split_points.len() - 1 {
|
||||||
let begin = split_points[i].clone();
|
let begin = split_points[i].clone();
|
||||||
let end = split_points[i + 1].clone();
|
let end = split_points[i + 1].clone();
|
||||||
let nodes = table.replication.write_nodes_from_ring(&begin, &ring);
|
let nodes = table.replication.replication_nodes(&begin, &ring);
|
||||||
|
|
||||||
let retain = nodes.contains(&my_id);
|
let retain = nodes.contains(&my_id);
|
||||||
if !retain {
|
if !retain {
|
||||||
|
@ -650,11 +650,11 @@ impl SyncTodo {
|
||||||
let end = all_points[i + 1].clone();
|
let end = all_points[i + 1].clone();
|
||||||
let was_ours = table
|
let was_ours = table
|
||||||
.replication
|
.replication
|
||||||
.write_nodes_from_ring(&begin, &old_ring)
|
.replication_nodes(&begin, &old_ring)
|
||||||
.contains(&my_id);
|
.contains(&my_id);
|
||||||
let is_ours = table
|
let is_ours = table
|
||||||
.replication
|
.replication
|
||||||
.write_nodes_from_ring(&begin, &new_ring)
|
.replication_nodes(&begin, &new_ring)
|
||||||
.contains(&my_id);
|
.contains(&my_id);
|
||||||
|
|
||||||
let was_todo = match old_todo.binary_search_by(|x| x.begin.cmp(&begin)) {
|
let was_todo = match old_todo.binary_search_by(|x| x.begin.cmp(&begin)) {
|
||||||
|
|
Loading…
Reference in a new issue