forked from Deuxfleurs/garage
Simplify replication logic
This commit is contained in:
parent 6a8439fd13
commit 1d9961e411
9 changed files with 60 additions and 58 deletions

@@ -319,10 +319,8 @@ impl BlockManager {
         if exists && !needed {
             trace!("Offloading block {:?}", hash);
 
-            let ring = self.system.ring.borrow().clone();
-
-            let mut who = self.replication.replication_nodes(&hash, &ring);
-            if who.len() < self.replication.write_quorum(&self.system) {
+            let mut who = self.replication.write_nodes(&hash);
+            if who.len() < self.replication.write_quorum() {
                 return Err(Error::Message(format!("Not trying to offload block because we don't have a quorum of nodes to write to")));
             }
             who.retain(|id| *id != self.system.id);
@@ -367,7 +365,7 @@ impl BlockManager {
                 )
                 .await?;
             }
-            trace!(
+            info!(
                 "Deleting block {:?}, offload finished ({} / {})",
                 hash,
                 need_nodes.len(),
@@ -391,7 +389,7 @@ impl BlockManager {
     }
 
     pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> {
-        let who = self.replication.read_nodes(&hash, &self.system);
+        let who = self.replication.read_nodes(&hash);
         let resps = self
             .rpc_client
             .try_call_many(
@@ -415,12 +413,12 @@ impl BlockManager {
     }
 
     pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> {
-        let who = self.replication.write_nodes(&hash, &self.system);
+        let who = self.replication.write_nodes(&hash);
         self.rpc_client
             .try_call_many(
                 &who[..],
                 Message::PutBlock(PutBlockMessage { hash, data }),
-                RequestStrategy::with_quorum(self.replication.write_quorum(&self.system))
+                RequestStrategy::with_quorum(self.replication.write_quorum())
                     .with_timeout(BLOCK_RW_TIMEOUT),
             )
             .await?;
@@ -54,18 +54,23 @@ impl Garage {
         );
 
         let data_rep_param = TableShardedReplication {
+            system: system.clone(),
             replication_factor: config.data_replication_factor,
             write_quorum: (config.data_replication_factor + 1) / 2,
             read_quorum: 1,
         };
 
         let meta_rep_param = TableShardedReplication {
+            system: system.clone(),
             replication_factor: config.meta_replication_factor,
             write_quorum: (config.meta_replication_factor + 1) / 2,
             read_quorum: (config.meta_replication_factor + 1) / 2,
         };
 
-        let control_rep_param = TableFullReplication::new(config.control_write_max_faults);
+        let control_rep_param = TableFullReplication {
+            system: system.clone(),
+            max_faults: config.control_write_max_faults,
+        };
 
         info!("Initialize block manager...");
         let block_manager = BlockManager::new(
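For context on the quorum values set up above: with integer division, (replication_factor + 1) / 2 is a simple majority of the replicas. A minimal standalone sketch in Rust, using an assumed replication factor of 3 (the example value is not taken from any real Garage config):

    fn main() {
        // Assumed example value; in Garage this comes from config.data_replication_factor.
        let replication_factor: usize = 3;

        // Same arithmetic as in the hunk above: a majority of the replicas must ack a write,
        // while a single successful read is enough for data blocks.
        let write_quorum = (replication_factor + 1) / 2;
        let read_quorum = 1;

        assert_eq!(write_quorum, 2);
        assert_eq!(read_quorum, 1);
        println!("write_quorum = {}, read_quorum = {}", write_quorum, read_quorum);
    }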
@@ -170,6 +170,11 @@ impl Ring {
         Self { config, ring }
     }
 
+    pub fn partition_of(&self, from: &Hash) -> u16 {
+        let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
+        top >> (16 - PARTITION_BITS)
+    }
+
     pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
         if self.ring.len() != 1 << PARTITION_BITS {
             warn!("Ring not yet ready, read/writes will be lost!");
@@ -177,8 +182,9 @@ impl Ring {
         }
 
         let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
         let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
+        assert_eq!(partition_idx, self.partition_of(from) as usize);
 
         let partition = &self.ring[partition_idx];
 
         let partition_top =
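The new Ring::partition_of keeps only the top bits of the hash as a partition index. A standalone sketch of that bit arithmetic follows; PARTITION_BITS = 8 is an assumed illustration value here, not something stated in this diff:

    // Assumed value for illustration only; the real constant is defined in Garage's ring module.
    const PARTITION_BITS: usize = 8;

    // Mirrors the arithmetic of Ring::partition_of shown above: interpret the first
    // two bytes of the hash as a big-endian u16, then keep the top PARTITION_BITS bits.
    fn partition_of(hash: &[u8; 32]) -> u16 {
        let top = u16::from_be_bytes([hash[0], hash[1]]);
        top >> (16 - PARTITION_BITS)
    }

    fn main() {
        let mut hash = [0u8; 32];
        hash[0] = 0xAB;
        hash[1] = 0xCD;
        // 0xABCD >> 8 == 0xAB, so this hash lands in partition 0xAB.
        assert_eq!(partition_of(&hash), 0xAB);
    }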
@@ -130,7 +130,7 @@ where
         let mut partitions = HashMap::new();
         for (k, vhash, v) in entries {
             let pkh = Hash::try_from(&k[..32]).unwrap();
-            let mut nodes = self.aux.replication.write_nodes(&pkh, &self.aux.system);
+            let mut nodes = self.aux.replication.write_nodes(&pkh);
             nodes.retain(|x| *x != self.aux.system.id);
             nodes.sort();
 
@@ -8,21 +8,10 @@ use crate::replication::*;
 
 #[derive(Clone)]
 pub struct TableFullReplication {
+    pub system: Arc<System>,
     pub max_faults: usize,
 }
 
-#[derive(Clone)]
-struct Neighbors {
-    ring: Arc<Ring>,
-    neighbors: Vec<UUID>,
-}
-
-impl TableFullReplication {
-    pub fn new(max_faults: usize) -> Self {
-        TableFullReplication { max_faults }
-    }
-}
-
 impl TableReplication for TableFullReplication {
     // Full replication schema: all nodes store everything
     // Writes are disseminated in an epidemic manner in the network
@@ -30,18 +19,23 @@ impl TableReplication for TableFullReplication {
     // Advantage: do all reads locally, extremely fast
     // Inconvenient: only suitable to reasonably small tables
 
-    fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec<UUID> {
-        vec![system.id]
+    fn partition_of(&self, _hash: &Hash) -> u16 {
+        0u16
+    }
+
+    fn read_nodes(&self, _hash: &Hash) -> Vec<UUID> {
+        vec![self.system.id]
     }
     fn read_quorum(&self) -> usize {
         1
     }
 
-    fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> {
-        self.replication_nodes(hash, system.ring.borrow().as_ref())
+    fn write_nodes(&self, _hash: &Hash) -> Vec<UUID> {
+        let ring = self.system.ring.borrow();
+        ring.config.members.keys().cloned().collect::<Vec<_>>()
     }
-    fn write_quorum(&self, system: &System) -> usize {
-        let nmembers = system.ring.borrow().config.members.len();
+    fn write_quorum(&self) -> usize {
+        let nmembers = self.system.ring.borrow().config.members.len();
         if nmembers > self.max_faults {
             nmembers - self.max_faults
         } else {
@@ -52,9 +46,6 @@ impl TableReplication for TableFullReplication {
         self.max_faults
     }
 
-    fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec<UUID> {
-        ring.config.members.keys().cloned().collect::<Vec<_>>()
-    }
     fn split_points(&self, _ring: &Ring) -> Vec<Hash> {
         let mut ret = vec![];
         ret.push([0u8; 32].into());
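A quick sanity check of the full-replication quorum rule above, where every configured member is a write target and the quorum is the member count minus the tolerated faults. This is a standalone sketch with assumed example values; the else branch of write_quorum is not visible in these hunks, so its value below is an assumption:

    fn main() {
        // Assumed example values: 5 cluster members, tolerate at most 1 faulty writer.
        let nmembers: usize = 5;
        let max_faults: usize = 1;

        // Same rule as TableFullReplication::write_quorum above for nmembers > max_faults;
        // the fallback branch is not shown in the diff, so this value is assumed.
        let write_quorum = if nmembers > max_faults {
            nmembers - max_faults
        } else {
            nmembers
        };

        assert_eq!(write_quorum, 4);
    }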
@@ -1,4 +1,3 @@
-use garage_rpc::membership::System;
 use garage_rpc::ring::Ring;
 
 use garage_util::data::*;
@@ -7,16 +6,18 @@ pub trait TableReplication: Send + Sync {
     // See examples in table_sharded.rs and table_fullcopy.rs
     // To understand various replication methods
 
+    // Partition number of data item (for Merkle tree)
+    fn partition_of(&self, hash: &Hash) -> u16;
+
     // Which nodes to send reads from
-    fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
+    fn read_nodes(&self, hash: &Hash) -> Vec<UUID>;
     fn read_quorum(&self) -> usize;
 
     // Which nodes to send writes to
-    fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
-    fn write_quorum(&self, system: &System) -> usize;
+    fn write_nodes(&self, hash: &Hash) -> Vec<UUID>;
+    fn write_quorum(&self) -> usize;
     fn max_write_errors(&self) -> usize;
 
-    // Which are the nodes that do actually replicate the data
-    fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
+    // Get partition boundaries
     fn split_points(&self, ring: &Ring) -> Vec<Hash>;
 }
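Pieced together from the additions in this hunk, the trait after this commit reads roughly as sketched below. This is a reconstruction for readability, not a verbatim copy of the file; the Hash, UUID and Ring types come from the imports kept above:

    use garage_rpc::ring::Ring;
    use garage_util::data::*;

    pub trait TableReplication: Send + Sync {
        // Partition number of data item (for Merkle tree)
        fn partition_of(&self, hash: &Hash) -> u16;

        // Which nodes to send reads from
        fn read_nodes(&self, hash: &Hash) -> Vec<UUID>;
        fn read_quorum(&self) -> usize;

        // Which nodes to send writes to
        fn write_nodes(&self, hash: &Hash) -> Vec<UUID>;
        fn write_quorum(&self) -> usize;
        fn max_write_errors(&self) -> usize;

        // Get partition boundaries
        fn split_points(&self, ring: &Ring) -> Vec<Hash>;
    }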
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use garage_rpc::membership::System;
 use garage_rpc::ring::Ring;
 use garage_util::data::*;
@@ -6,6 +8,7 @@ use crate::replication::*;
 
 #[derive(Clone)]
 pub struct TableShardedReplication {
+    pub system: Arc<System>,
     pub replication_factor: usize,
     pub read_quorum: usize,
     pub write_quorum: usize,
@@ -19,28 +22,29 @@ impl TableReplication for TableShardedReplication {
     // - reads are done on all of the nodes that replicate the data
     // - writes as well
 
-    fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> {
-        let ring = system.ring.borrow().clone();
+    fn partition_of(&self, hash: &Hash) -> u16 {
+        self.system.ring.borrow().partition_of(hash)
+    }
+
+    fn read_nodes(&self, hash: &Hash) -> Vec<UUID> {
+        let ring = self.system.ring.borrow().clone();
         ring.walk_ring(&hash, self.replication_factor)
     }
     fn read_quorum(&self) -> usize {
         self.read_quorum
     }
 
-    fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> {
-        let ring = system.ring.borrow().clone();
+    fn write_nodes(&self, hash: &Hash) -> Vec<UUID> {
+        let ring = self.system.ring.borrow();
         ring.walk_ring(&hash, self.replication_factor)
     }
-    fn write_quorum(&self, _system: &System) -> usize {
+    fn write_quorum(&self) -> usize {
         self.write_quorum
     }
     fn max_write_errors(&self) -> usize {
         self.replication_factor - self.write_quorum
     }
 
-    fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID> {
-        ring.walk_ring(&hash, self.replication_factor)
-    }
     fn split_points(&self, ring: &Ring) -> Vec<Hash> {
         let mut ret = vec![];
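One consequence of the sharded scheme above: the number of tolerable write errors is the slack between the replication factor and the write quorum. A small standalone check, with example values assumed to match the data_rep_param hunk earlier in this diff:

    fn main() {
        // Assumed example values, consistent with the data_rep_param hunk above.
        let replication_factor: usize = 3;
        let write_quorum = (replication_factor + 1) / 2; // 2

        // Same relation as TableShardedReplication::max_write_errors above.
        let max_write_errors = replication_factor - write_quorum;

        assert_eq!(max_write_errors, 1);
    }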
@@ -218,10 +218,7 @@ where
             let nodes = self
                 .aux
                 .replication
-                .write_nodes(
-                    &hash_of_merkle_partition(partition.range.begin),
-                    &self.aux.system,
-                )
+                .write_nodes(&hash_of_merkle_partition(partition.range.begin))
                 .into_iter()
                 .filter(|node| *node != my_id)
                 .collect::<Vec<_>>();
@@ -293,7 +290,7 @@ where
         let nodes = self
             .aux
             .replication
-            .write_nodes(&begin, &self.aux.system)
+            .write_nodes(&begin)
             .into_iter()
             .collect::<Vec<_>>();
         if nodes.contains(&self.aux.system.id) {
@@ -303,7 +300,7 @@ where
             );
             break;
         }
-        if nodes.len() < self.aux.replication.write_quorum(&self.aux.system) {
+        if nodes.len() < self.aux.replication.write_quorum() {
            return Err(Error::Message(format!(
                "Not offloading as we don't have a quorum of nodes to write to."
            )));
@@ -616,7 +613,7 @@ impl SyncTodo {
         let begin_hash = hash_of_merkle_partition(begin);
         let end_hash = hash_of_merkle_partition_opt(end);
 
-        let nodes = aux.replication.replication_nodes(&begin_hash, &ring);
+        let nodes = aux.replication.write_nodes(&begin_hash);
 
         let retain = nodes.contains(&my_id);
         if !retain {
@@ -91,7 +91,7 @@ where
 
     pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
         let hash = e.partition_key().hash();
-        let who = self.aux.replication.write_nodes(&hash, &self.aux.system);
+        let who = self.aux.replication.write_nodes(&hash);
         //eprintln!("insert who: {:?}", who);
 
         let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
@@ -101,7 +101,7 @@ where
             .try_call_many(
                 &who[..],
                 rpc,
-                RequestStrategy::with_quorum(self.aux.replication.write_quorum(&self.aux.system))
+                RequestStrategy::with_quorum(self.aux.replication.write_quorum())
                     .with_timeout(TABLE_RPC_TIMEOUT),
             )
             .await?;
@@ -113,7 +113,7 @@ where
 
         for entry in entries.iter() {
             let hash = entry.partition_key().hash();
-            let who = self.aux.replication.write_nodes(&hash, &self.aux.system);
+            let who = self.aux.replication.write_nodes(&hash);
             let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
             for node in who {
                 if !call_list.contains_key(&node) {
@@ -150,7 +150,7 @@ where
         sort_key: &F::S,
     ) -> Result<Option<F::E>, Error> {
         let hash = partition_key.hash();
-        let who = self.aux.replication.read_nodes(&hash, &self.aux.system);
+        let who = self.aux.replication.read_nodes(&hash);
         //eprintln!("get who: {:?}", who);
 
         let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
@@ -207,7 +207,7 @@ where
         limit: usize,
     ) -> Result<Vec<F::E>, Error> {
         let hash = partition_key.hash();
-        let who = self.aux.replication.read_nodes(&hash, &self.aux.system);
+        let who = self.aux.replication.read_nodes(&hash);
 
         let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
 