forked from Deuxfleurs/garage
Not fully tested: new multi-dc MagLev
This commit is contained in:
parent
3882d5ba36
commit
d7e005251d
4 changed files with 152 additions and 94 deletions
|
@ -262,21 +262,21 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
print("------")
|
print("------")
|
||||||
print("method 2 (custom ring)")
|
print("method 2 (custom ring)")
|
||||||
nodes = [('digitale', 'atuin', 4),
|
nodes = [('digitale', 'atuin', 1),
|
||||||
('drosera', 'atuin', 4),
|
('drosera', 'atuin', 1),
|
||||||
('datura', 'atuin', 4),
|
('datura', 'atuin', 1),
|
||||||
('io', 'jupiter', 8)]
|
('io', 'jupiter', 2)]
|
||||||
nodes2 = [('digitale', 'atuin', 8),
|
nodes2 = [('digitale', 'atuin', 2),
|
||||||
('drosera', 'atuin', 8),
|
('drosera', 'atuin', 2),
|
||||||
('datura', 'atuin', 8),
|
('datura', 'atuin', 2),
|
||||||
('io', 'jupiter', 16),
|
('io', 'jupiter', 4),
|
||||||
('isou', 'jupiter', 8),
|
('isou', 'jupiter', 2),
|
||||||
('mini', 'grog', 4),
|
('mini', 'grog', 1),
|
||||||
('mixi', 'grog', 4),
|
('mixi', 'grog', 1),
|
||||||
('moxi', 'grog', 4),
|
('moxi', 'grog', 1),
|
||||||
('modi', 'grog', 4),
|
('modi', 'grog', 1),
|
||||||
('geant', 'grisou', 16),
|
('geant', 'grisou', 4),
|
||||||
('gipsie', 'grisou', 16),
|
('gipsie', 'grisou', 4),
|
||||||
]
|
]
|
||||||
evaluate_method(method2, nodes2)
|
evaluate_method(method2, nodes2)
|
||||||
|
|
||||||
|
|
|
@ -218,12 +218,7 @@ impl System {
|
||||||
.unwrap_or("<invalid utf-8>".to_string()),
|
.unwrap_or("<invalid utf-8>".to_string()),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut ring = Ring {
|
let ring = Ring::new(net_config);
|
||||||
config: net_config,
|
|
||||||
ring: Vec::new(),
|
|
||||||
n_datacenters: 0,
|
|
||||||
};
|
|
||||||
ring.rebuild_ring();
|
|
||||||
let (update_ring, ring) = watch::channel(Arc::new(ring));
|
let (update_ring, ring) = watch::channel(Arc::new(ring));
|
||||||
|
|
||||||
let rpc_path = MEMBERSHIP_RPC_PATH.to_string();
|
let rpc_path = MEMBERSHIP_RPC_PATH.to_string();
|
||||||
|
@ -531,10 +526,7 @@ impl System {
|
||||||
let ring: Arc<Ring> = self.ring.borrow().clone();
|
let ring: Arc<Ring> = self.ring.borrow().clone();
|
||||||
|
|
||||||
if adv.version > ring.config.version {
|
if adv.version > ring.config.version {
|
||||||
let mut ring = ring.as_ref().clone();
|
let ring = Ring::new(adv.clone());
|
||||||
|
|
||||||
ring.config = adv.clone();
|
|
||||||
ring.rebuild_ring();
|
|
||||||
update_lock.1.broadcast(Arc::new(ring))?;
|
update_lock.1.broadcast(Arc::new(ring))?;
|
||||||
drop(update_lock);
|
drop(update_lock);
|
||||||
|
|
||||||
|
|
203
src/rpc/ring.rs
203
src/rpc/ring.rs
|
@ -1,9 +1,22 @@
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use garage_util::data::*;
|
use garage_util::data::*;
|
||||||
|
|
||||||
|
// TODO: make this constant parametrizable in the config file
|
||||||
|
// For deployments with many nodes it might make sense to bump
|
||||||
|
// it up to 10.
|
||||||
|
// Maximum value : 16
|
||||||
|
pub const PARTITION_BITS: usize = 8;
|
||||||
|
|
||||||
|
const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS);
|
||||||
|
|
||||||
|
// TODO: make this constant paraetrizable in the config file
|
||||||
|
// (most deployments use a replication factor of 3, so...)
|
||||||
|
pub const MAX_REPLICATION: usize = 3;
|
||||||
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
pub struct NetworkConfig {
|
pub struct NetworkConfig {
|
||||||
pub members: HashMap<UUID, NetworkConfigEntry>,
|
pub members: HashMap<UUID, NetworkConfigEntry>,
|
||||||
|
@ -30,96 +43,150 @@ pub struct NetworkConfigEntry {
|
||||||
pub struct Ring {
|
pub struct Ring {
|
||||||
pub config: NetworkConfig,
|
pub config: NetworkConfig,
|
||||||
pub ring: Vec<RingEntry>,
|
pub ring: Vec<RingEntry>,
|
||||||
pub n_datacenters: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct RingEntry {
|
pub struct RingEntry {
|
||||||
pub location: Hash,
|
pub location: Hash,
|
||||||
pub node: UUID,
|
pub nodes: [UUID; MAX_REPLICATION],
|
||||||
datacenter: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Ring {
|
impl Ring {
|
||||||
pub(crate) fn rebuild_ring(&mut self) {
|
pub(crate) fn new(config: NetworkConfig) -> Self {
|
||||||
let mut new_ring = vec![];
|
// Create a vector of partition indices (0 to 2**PARTITION_BITS-1)
|
||||||
let mut datacenters = vec![];
|
let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>();
|
||||||
|
|
||||||
for (id, config) in self.config.members.iter() {
|
let datacenters = config
|
||||||
let datacenter = &config.datacenter;
|
.members
|
||||||
|
.iter()
|
||||||
|
.map(|(_id, info)| info.datacenter.as_str())
|
||||||
|
.collect::<HashSet<&str>>();
|
||||||
|
let n_datacenters = datacenters.len();
|
||||||
|
|
||||||
if !datacenters.contains(datacenter) {
|
// Prepare ring
|
||||||
datacenters.push(datacenter.to_string());
|
let mut partitions: Vec<Vec<(&UUID, &NetworkConfigEntry)>> = partitions_idx
|
||||||
|
.iter()
|
||||||
|
.map(|_i| Vec::new())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
// Create MagLev priority queues for each node
|
||||||
|
let mut queues = config
|
||||||
|
.members
|
||||||
|
.iter()
|
||||||
|
.map(|(node_id, node_info)| {
|
||||||
|
let mut parts = partitions_idx
|
||||||
|
.iter()
|
||||||
|
.map(|i| {
|
||||||
|
let part_data =
|
||||||
|
[&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat();
|
||||||
|
(*i, fasthash(&part_data[..]))
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
parts.sort_by_key(|(_i, h)| *h);
|
||||||
|
let parts_i = parts.iter().map(|(i, _h)| *i).collect::<Vec<_>>();
|
||||||
|
(node_id, node_info, parts_i, 0)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let max_toktok = config
|
||||||
|
.members
|
||||||
|
.iter()
|
||||||
|
.map(|(_, node_info)| node_info.n_tokens)
|
||||||
|
.fold(0, std::cmp::max);
|
||||||
|
|
||||||
|
// Fill up ring
|
||||||
|
for rep in 0..MAX_REPLICATION {
|
||||||
|
queues.sort_by_key(|(ni, _np, _q, _p)| {
|
||||||
|
let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat();
|
||||||
|
fasthash(&queue_data[..])
|
||||||
|
});
|
||||||
|
|
||||||
|
for (_, _, _, pos) in queues.iter_mut() {
|
||||||
|
*pos = 0;
|
||||||
}
|
}
|
||||||
let datacenter_idx = datacenters
|
|
||||||
|
let mut remaining = partitions_idx.len();
|
||||||
|
while remaining > 0 {
|
||||||
|
let remaining0 = remaining;
|
||||||
|
for toktok in 0..max_toktok {
|
||||||
|
for (node_id, node_info, q, pos) in queues.iter_mut() {
|
||||||
|
if toktok >= node_info.n_tokens {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for pos2 in *pos..q.len() {
|
||||||
|
let qv = q[pos2];
|
||||||
|
if partitions[qv].len() != rep {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let p_dcs = partitions[qv]
|
||||||
|
.iter()
|
||||||
|
.map(|(_id, info)| info.datacenter.as_str())
|
||||||
|
.collect::<HashSet<&str>>();
|
||||||
|
if !partitions[qv]
|
||||||
|
.iter()
|
||||||
|
.any(|(_id, i)| *i.datacenter == node_info.datacenter)
|
||||||
|
|| (p_dcs.len() == n_datacenters
|
||||||
|
&& !partitions[qv].iter().any(|(id, _i)| id == node_id))
|
||||||
|
{
|
||||||
|
partitions[qv].push((node_id, node_info));
|
||||||
|
remaining -= 1;
|
||||||
|
*pos = pos2 + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if remaining == remaining0 {
|
||||||
|
// No progress made, exit
|
||||||
|
warn!("Could not build ring, not enough nodes configured.");
|
||||||
|
return Self {
|
||||||
|
config,
|
||||||
|
ring: vec![],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let ring = partitions
|
||||||
.iter()
|
.iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.find(|(_, dc)| *dc == datacenter)
|
.map(|(i, nodes)| {
|
||||||
.unwrap()
|
let top = (i as u16) << (16 - PARTITION_BITS);
|
||||||
.0;
|
let mut hash = [0u8; 32];
|
||||||
|
hash[0..2].copy_from_slice(&u16::to_be_bytes(top)[..]);
|
||||||
for i in 0..config.n_tokens {
|
let nodes = nodes.iter().map(|(id, _info)| **id).collect::<Vec<UUID>>();
|
||||||
let location = sha256sum(format!("{} {}", hex::encode(&id), i).as_bytes());
|
RingEntry {
|
||||||
|
location: hash.into(),
|
||||||
new_ring.push(RingEntry {
|
nodes: nodes.try_into().unwrap(),
|
||||||
location: location.into(),
|
}
|
||||||
node: *id,
|
|
||||||
datacenter: datacenter_idx,
|
|
||||||
})
|
})
|
||||||
}
|
.collect::<Vec<_>>();
|
||||||
}
|
|
||||||
|
|
||||||
new_ring.sort_unstable_by(|x, y| x.location.cmp(&y.location));
|
eprintln!("RING: --");
|
||||||
self.ring = new_ring;
|
for e in ring.iter() {
|
||||||
self.n_datacenters = datacenters.len();
|
eprintln!("{:?}", e);
|
||||||
|
}
|
||||||
|
eprintln!("END --");
|
||||||
|
|
||||||
// eprintln!("RING: --");
|
Self { config, ring }
|
||||||
// for e in self.ring.iter() {
|
|
||||||
// eprintln!("{:?}", e);
|
|
||||||
// }
|
|
||||||
// eprintln!("END --");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
|
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
|
||||||
if n >= self.config.members.len() {
|
if self.ring.len() != 1 << PARTITION_BITS {
|
||||||
return self.config.members.keys().cloned().collect::<Vec<_>>();
|
warn!("Ring not yet ready, read/writes will be lost");
|
||||||
|
return vec![];
|
||||||
}
|
}
|
||||||
|
|
||||||
let start = match self.ring.binary_search_by(|x| x.location.cmp(from)) {
|
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
|
||||||
Ok(i) => i,
|
|
||||||
Err(i) => {
|
|
||||||
if i == 0 {
|
|
||||||
self.ring.len() - 1
|
|
||||||
} else {
|
|
||||||
i - 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
self.walk_ring_from_pos(start, n)
|
let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
|
||||||
}
|
let partition = &self.ring[partition_idx];
|
||||||
|
|
||||||
fn walk_ring_from_pos(&self, start: usize, n: usize) -> Vec<UUID> {
|
let partition_top =
|
||||||
if n >= self.config.members.len() {
|
u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap());
|
||||||
return self.config.members.keys().cloned().collect::<Vec<_>>();
|
assert!(partition_top & PARTITION_MASK_U16 == top & PARTITION_MASK_U16);
|
||||||
}
|
|
||||||
|
|
||||||
let mut ret = vec![];
|
assert!(n <= partition.nodes.len());
|
||||||
let mut datacenters = vec![];
|
partition.nodes[..n].iter().cloned().collect::<Vec<_>>()
|
||||||
|
|
||||||
let mut delta = 0;
|
|
||||||
while ret.len() < n {
|
|
||||||
let i = (start + delta) % self.ring.len();
|
|
||||||
delta += 1;
|
|
||||||
|
|
||||||
if !datacenters.contains(&self.ring[i].datacenter) {
|
|
||||||
ret.push(self.ring[i].node);
|
|
||||||
datacenters.push(self.ring[i].datacenter);
|
|
||||||
} else if datacenters.len() == self.n_datacenters && !ret.contains(&self.ring[i].node) {
|
|
||||||
ret.push(self.ring[i].node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ret
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,6 @@ impl TableReplication for TableShardedReplication {
|
||||||
fn split_points(&self, ring: &Ring) -> Vec<Hash> {
|
fn split_points(&self, ring: &Ring) -> Vec<Hash> {
|
||||||
let mut ret = vec![];
|
let mut ret = vec![];
|
||||||
|
|
||||||
ret.push([0u8; 32].into());
|
|
||||||
for entry in ring.ring.iter() {
|
for entry in ring.ring.iter() {
|
||||||
ret.push(entry.location);
|
ret.push(entry.location);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue