garage/src/rpc/ring.rs

240 lines
7.2 KiB
Rust
Raw Normal View History

2021-03-22 00:00:09 +01:00
//! Module containing types related to computing which nodes should receive a copy of data blocks
//! and metadata
2021-03-05 16:22:29 +01:00
use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
2021-02-21 13:11:10 +01:00
use serde::{Deserialize, Serialize};
use garage_util::data::*;
2021-03-16 12:18:03 +01:00
/// A partition id, stored on 16 bits.
///
/// The 16-bit encoding allows up to 2**16 partitions; in practice the ring holds exactly
/// 2**`PARTITION_BITS` partitions.
pub type Partition = u16;
2021-03-05 16:22:29 +01:00
// TODO: make this constant parametrizable in the config file
// For deployments with many nodes it might make sense to bump
// it up to 10.
/// How many bits from the hash are used to make partitions. Higher numbers mean more fairness in
/// the presence of numerous nodes, but an exponentially bigger ring. Maximum value: 16.
pub const PARTITION_BITS: usize = 8;

/// Mask selecting the `PARTITION_BITS` most significant bits of a 16-bit hash prefix.
///
/// The intermediate value is computed in `u32`: with the documented maximum of
/// `PARTITION_BITS == 16`, `1 << PARTITION_BITS` does not fit in a `u16` and would make constant
/// evaluation fail. The final value always fits in 16 bits, so the narrowing cast is lossless.
/// A `PARTITION_BITS` above 16 still fails to compile, because `16 - PARTITION_BITS` underflows.
const PARTITION_MASK_U16: u16 = (((1u32 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS)) as u16;
// TODO: make this constant parametrizable in the config file
// (most deployments use a replication factor of 3, so...)
/// The maximum number of times an object might get replicated
pub const MAX_REPLICATION: usize = 3;
/// The user-defined configuration of the cluster's nodes
2021-02-21 13:11:10 +01:00
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NetworkConfig {
2021-03-22 00:00:09 +01:00
/// Map of each node's id to it's configuration
2021-02-21 13:11:10 +01:00
pub members: HashMap<UUID, NetworkConfigEntry>,
2021-03-22 00:00:09 +01:00
/// Version of this config
2021-02-21 13:11:10 +01:00
pub version: u64,
}
impl NetworkConfig {
pub(crate) fn new() -> Self {
2021-02-23 18:46:25 +01:00
Self {
2021-02-21 13:11:10 +01:00
members: HashMap::new(),
version: 0,
}
}
}
2021-03-22 00:00:09 +01:00
/// The overall configuration of one (possibly remote) node
2021-02-21 13:11:10 +01:00
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NetworkConfigEntry {
2021-03-22 00:00:09 +01:00
/// Datacenter at which this entry belong. This infromation might be used to perform a better
/// geodistribution
2021-02-21 13:11:10 +01:00
pub datacenter: String,
2021-03-22 00:00:09 +01:00
/// The (relative) capacity of the node
2021-03-10 14:52:03 +01:00
pub capacity: u32,
2021-03-22 00:00:09 +01:00
/// A tag to recognize the entry, not used for other things than display
2021-02-21 13:11:10 +01:00
pub tag: String,
}
2021-03-22 00:00:09 +01:00
/// A ring distributing fairly objects to nodes
2021-02-21 13:11:10 +01:00
#[derive(Clone)]
pub struct Ring {
2021-03-22 00:00:09 +01:00
/// The network configuration used to generate this ring
2021-02-21 13:11:10 +01:00
pub config: NetworkConfig,
2021-03-22 00:00:09 +01:00
/// The list of entries in the ring
2021-02-21 13:11:10 +01:00
pub ring: Vec<RingEntry>,
}
2021-03-22 00:00:09 +01:00
/// An entry in the ring
2021-02-21 13:11:10 +01:00
#[derive(Clone, Debug)]
pub struct RingEntry {
2021-03-22 00:00:09 +01:00
/// The prefix of the Hash of object which should use this entry
2021-02-21 13:11:10 +01:00
pub location: Hash,
2021-03-22 00:00:09 +01:00
/// The nodes in which a matching object should get stored
2021-03-05 16:22:29 +01:00
pub nodes: [UUID; MAX_REPLICATION],
2021-02-21 13:11:10 +01:00
}
impl Ring {
2021-03-22 00:00:09 +01:00
// TODO this function MUST be refactored, it's 100 lines long, with a 50 lines loop, going up to 6
// levels of imbrication. It is basically impossible to test, maintain, or understand for an
// outsider.
2021-03-05 16:22:29 +01:00
pub(crate) fn new(config: NetworkConfig) -> Self {
// Create a vector of partition indices (0 to 2**PARTITION_BITS-1)
let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>();
let datacenters = config
.members
.iter()
.map(|(_id, info)| info.datacenter.as_str())
.collect::<HashSet<&str>>();
let n_datacenters = datacenters.len();
// Prepare ring
let mut partitions: Vec<Vec<(&UUID, &NetworkConfigEntry)>> = partitions_idx
.iter()
.map(|_i| Vec::new())
.collect::<Vec<_>>();
// Create MagLev priority queues for each node
let mut queues = config
.members
.iter()
.map(|(node_id, node_info)| {
let mut parts = partitions_idx
.iter()
.map(|i| {
let part_data =
[&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat();
(*i, fasthash(&part_data[..]))
})
.collect::<Vec<_>>();
parts.sort_by_key(|(_i, h)| *h);
let parts_i = parts.iter().map(|(i, _h)| *i).collect::<Vec<_>>();
(node_id, node_info, parts_i, 0)
})
.collect::<Vec<_>>();
2021-03-10 14:52:03 +01:00
let max_capacity = config
2021-03-05 16:22:29 +01:00
.members
.iter()
2021-03-10 14:52:03 +01:00
.map(|(_, node_info)| node_info.capacity)
2021-03-05 16:22:29 +01:00
.fold(0, std::cmp::max);
// Fill up ring
for rep in 0..MAX_REPLICATION {
queues.sort_by_key(|(ni, _np, _q, _p)| {
let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat();
fasthash(&queue_data[..])
});
for (_, _, _, pos) in queues.iter_mut() {
*pos = 0;
2021-02-21 13:11:10 +01:00
}
2021-03-05 16:22:29 +01:00
let mut remaining = partitions_idx.len();
while remaining > 0 {
let remaining0 = remaining;
2021-03-10 14:52:03 +01:00
for i_round in 0..max_capacity {
2021-03-05 16:22:29 +01:00
for (node_id, node_info, q, pos) in queues.iter_mut() {
2021-03-10 14:52:03 +01:00
if i_round >= node_info.capacity {
2021-03-05 16:22:29 +01:00
continue;
}
for pos2 in *pos..q.len() {
let qv = q[pos2];
if partitions[qv].len() != rep {
continue;
}
let p_dcs = partitions[qv]
.iter()
.map(|(_id, info)| info.datacenter.as_str())
.collect::<HashSet<&str>>();
2021-03-05 17:08:03 +01:00
if (p_dcs.len() < n_datacenters
&& !p_dcs.contains(&node_info.datacenter.as_str()))
2021-03-05 16:22:29 +01:00
|| (p_dcs.len() == n_datacenters
&& !partitions[qv].iter().any(|(id, _i)| id == node_id))
{
partitions[qv].push((node_id, node_info));
remaining -= 1;
*pos = pos2 + 1;
break;
}
}
}
}
if remaining == remaining0 {
// No progress made, exit
warn!("Could not build ring, not enough nodes configured.");
return Self {
config,
ring: vec![],
};
}
2021-02-21 13:11:10 +01:00
}
}
2021-03-05 16:22:29 +01:00
let ring = partitions
.iter()
.enumerate()
.map(|(i, nodes)| {
let top = (i as u16) << (16 - PARTITION_BITS);
let mut hash = [0u8; 32];
hash[0..2].copy_from_slice(&u16::to_be_bytes(top)[..]);
let nodes = nodes.iter().map(|(id, _info)| **id).collect::<Vec<UUID>>();
RingEntry {
location: hash.into(),
nodes: nodes.try_into().unwrap(),
}
})
.collect::<Vec<_>>();
2021-02-21 13:11:10 +01:00
2021-03-05 16:22:29 +01:00
Self { config, ring }
2021-02-21 13:11:10 +01:00
}
2021-03-22 00:00:09 +01:00
/// Get the partition in which data would fall on
2021-03-16 12:18:03 +01:00
pub fn partition_of(&self, from: &Hash) -> Partition {
2021-03-16 11:14:27 +01:00
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
top >> (16 - PARTITION_BITS)
}
2021-04-06 05:25:28 +02:00
/// Get the list of partitions and the first hash of a partition key that would fall in it
2021-03-16 12:18:03 +01:00
pub fn partitions(&self) -> Vec<(Partition, Hash)> {
let mut ret = vec![];
for (i, entry) in self.ring.iter().enumerate() {
ret.push((i as u16, entry.location));
}
if ret.len() > 0 {
assert_eq!(ret[0].1, [0u8; 32].into());
}
ret
}
2021-04-06 05:25:28 +02:00
// TODO rename this function as it no longer walk the ring
2021-03-22 00:00:09 +01:00
/// Walk the ring to find the n servers in which data should be replicated
2021-03-05 16:22:29 +01:00
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
if self.ring.len() != 1 << PARTITION_BITS {
2021-03-10 21:50:09 +01:00
warn!("Ring not yet ready, read/writes will be lost!");
2021-03-05 16:22:29 +01:00
return vec![];
2021-02-21 13:11:10 +01:00
}
2021-03-05 16:22:29 +01:00
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
2021-03-22 00:00:09 +01:00
// TODO why computing two time in the same way and asserting?
2021-03-16 11:14:27 +01:00
assert_eq!(partition_idx, self.partition_of(from) as usize);
2021-03-05 16:22:29 +01:00
let partition = &self.ring[partition_idx];
2021-02-21 13:11:10 +01:00
2021-03-05 16:22:29 +01:00
let partition_top =
u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap());
2021-03-22 00:00:09 +01:00
// TODO is this an assertion on the validity of PARTITION_MASK_U16? If so, it should
// probably be a test more than a runtime assertion
assert_eq!(partition_top & PARTITION_MASK_U16, top & PARTITION_MASK_U16);
2021-02-21 13:11:10 +01:00
2021-03-05 16:22:29 +01:00
assert!(n <= partition.nodes.len());
partition.nodes[..n].iter().cloned().collect::<Vec<_>>()
2021-02-21 13:11:10 +01:00
}
}