forked from Deuxfleurs/garage
Slight change and add comment to layout assignation algo
This commit is contained in:
parent
a0ec6c49be
commit
f2a504bf9b
1 changed files with 33 additions and 37 deletions
|
@ -175,47 +175,32 @@ impl ClusterLayout {
|
||||||
// Get old partition assignation
|
// Get old partition assignation
|
||||||
let old_partitions = self.parse_assignation_data();
|
let old_partitions = self.parse_assignation_data();
|
||||||
|
|
||||||
// Create new partition assignation starting from old one
|
// Start new partition assignation with nodes from old assignation where it is relevant
|
||||||
let mut partitions = old_partitions.clone();
|
let mut partitions = old_partitions
|
||||||
|
.iter()
|
||||||
// Cleanup steps in new partition assignation:
|
.map(|old_part| {
|
||||||
let min_keep_nodes_per_part = (self.replication_factor + 1) / 2;
|
let mut new_part = PartitionAss::new();
|
||||||
for part in partitions.iter_mut() {
|
for node in old_part.nodes.iter() {
|
||||||
// - remove from assignation nodes that don't have a role in the layout anymore
|
if let Some(role) = node.1 {
|
||||||
part.nodes
|
if role.capacity.is_some() {
|
||||||
.retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
|
new_part.add(None, n_zones, node.0, role);
|
||||||
|
}
|
||||||
// - remove from assignation some nodes that are in the same datacenter
|
|
||||||
// if we can, so that the later steps can ensure datacenter variety
|
|
||||||
// as much as possible (but still under the constraint that each partition
|
|
||||||
// should not move from at least a certain number of nodes that is
|
|
||||||
// min_keep_nodes_per_part)
|
|
||||||
'rmloop: while part.nodes.len() > min_keep_nodes_per_part {
|
|
||||||
let mut zns_c = HashMap::<&str, usize>::new();
|
|
||||||
for (_id, info) in part.nodes.iter() {
|
|
||||||
*zns_c.entry(info.unwrap().zone.as_str()).or_insert(0) += 1;
|
|
||||||
}
|
|
||||||
for i in 0..part.nodes.len() {
|
|
||||||
if zns_c[part.nodes[i].1.unwrap().zone.as_str()] > 1 {
|
|
||||||
part.nodes.remove(i);
|
|
||||||
continue 'rmloop;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
new_part
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
break;
|
// In various cases, not enough nodes will have been added for all partitions
|
||||||
}
|
// in the step above (e.g. due to node removals, or new zones being added).
|
||||||
}
|
// Here we add more nodes to make a complete (but sub-optimal) assignation,
|
||||||
|
|
||||||
// When nodes are removed, or when bootstraping an assignation from
|
|
||||||
// scratch for a new cluster, the old partitions will have holes (or be empty).
|
|
||||||
// Here we add more nodes to make a complete (sub-optimal) assignation,
|
|
||||||
// using an initial partition assignation that is calculated using the multi-dc maglev trick
|
// using an initial partition assignation that is calculated using the multi-dc maglev trick
|
||||||
match self.initial_partition_assignation() {
|
match self.initial_partition_assignation() {
|
||||||
Some(initial_partitions) => {
|
Some(initial_partitions) => {
|
||||||
for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) {
|
for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) {
|
||||||
for (id, info) in ipart.nodes.iter() {
|
for (id, info) in ipart.nodes.iter() {
|
||||||
if part.nodes.len() < self.replication_factor {
|
if part.nodes.len() < self.replication_factor {
|
||||||
part.add(part.nodes.len() + 1, n_zones, id, info.unwrap());
|
part.add(None, n_zones, id, info.unwrap());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert!(part.nodes.len() == self.replication_factor);
|
assert!(part.nodes.len() == self.replication_factor);
|
||||||
|
@ -287,7 +272,7 @@ impl ClusterLayout {
|
||||||
let mut newpart = part.clone();
|
let mut newpart = part.clone();
|
||||||
|
|
||||||
newpart.nodes.remove(irm);
|
newpart.nodes.remove(irm);
|
||||||
if !newpart.add(newpart.nodes.len() + 1, n_zones, idadd, infoadd) {
|
if !newpart.add(None, n_zones, idadd, infoadd) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
assert!(newpart.nodes.len() == self.replication_factor);
|
assert!(newpart.nodes.len() == self.replication_factor);
|
||||||
|
@ -422,7 +407,7 @@ impl ClusterLayout {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (pos2, &qv) in q.iter().enumerate().skip(*pos) {
|
for (pos2, &qv) in q.iter().enumerate().skip(*pos) {
|
||||||
if partitions[qv].add(rep + 1, n_zones, node_id, node_info) {
|
if partitions[qv].add(Some(rep + 1), n_zones, node_id, node_info) {
|
||||||
remaining -= 1;
|
remaining -= 1;
|
||||||
*pos = pos2 + 1;
|
*pos = pos2 + 1;
|
||||||
break;
|
break;
|
||||||
|
@ -579,15 +564,26 @@ impl<'a> PartitionAss<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// add is a key function in creating a PartitionAss, i.e. the list of nodes
|
||||||
|
// to which a partition is assigned. It tries to add a certain node id to the
|
||||||
|
// assignation, but checks that doing so is compatible with the NECESSARY
|
||||||
|
// condition that the partition assignation must be dispersed over different
|
||||||
|
// zones (datacenters) if enough zones exist. This is why it takes a n_zones
|
||||||
|
// parameter, which is the total number of zones that have existing nodes:
|
||||||
|
// if nodes in the assignation already cover all n_zones zones, then any node
|
||||||
|
// that is not yet in the assignation can be added. Otherwise, only nodes
|
||||||
|
// that are in a new zone can be added.
|
||||||
fn add(
|
fn add(
|
||||||
&mut self,
|
&mut self,
|
||||||
target_len: usize,
|
target_len: Option<usize>,
|
||||||
n_zones: usize,
|
n_zones: usize,
|
||||||
node: &'a Uuid,
|
node: &'a Uuid,
|
||||||
role: &'a NodeRole,
|
role: &'a NodeRole,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
if self.nodes.len() != target_len - 1 {
|
if let Some(tl) = target_len {
|
||||||
return false;
|
if self.nodes.len() != tl - 1 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let p_zns = self
|
let p_zns = self
|
||||||
|
|
Loading…
Reference in a new issue