forked from Deuxfleurs/garage
Small change to partition assignation algorithm
This change helps ensure that nodes for each partition are spread over all datacenters, a property that wasn't ensured previously when going from a 2 DC deployment to a 3 DC deployment
This commit is contained in:
parent
ba6b56ae68
commit
a0ec6c49be
2 changed files with 37 additions and 9 deletions
|
@ -196,6 +196,15 @@ pub async fn cmd_apply_layout(
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
|
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||||
|
|
||||||
|
layout.roles.merge(&layout.staging);
|
||||||
|
|
||||||
|
if !layout.calculate_partition_assignation() {
|
||||||
|
return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
|
||||||
|
}
|
||||||
|
|
||||||
|
layout.staging.clear();
|
||||||
|
layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
|
||||||
|
|
||||||
match apply_opt.version {
|
match apply_opt.version {
|
||||||
None => {
|
None => {
|
||||||
println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
|
println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
|
||||||
|
@ -209,15 +218,6 @@ pub async fn cmd_apply_layout(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
layout.roles.merge(&layout.staging);
|
|
||||||
|
|
||||||
if !layout.calculate_partition_assignation() {
|
|
||||||
return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
|
|
||||||
}
|
|
||||||
|
|
||||||
layout.staging.clear();
|
|
||||||
layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
|
|
||||||
|
|
||||||
layout.version += 1;
|
layout.version += 1;
|
||||||
|
|
||||||
send_layout(rpc_cli, rpc_host, layout).await?;
|
send_layout(rpc_cli, rpc_host, layout).await?;
|
||||||
|
|
|
@ -172,12 +172,38 @@ impl ClusterLayout {
|
||||||
println!("Calculating updated partition assignation, this may take some time...");
|
println!("Calculating updated partition assignation, this may take some time...");
|
||||||
println!();
|
println!();
|
||||||
|
|
||||||
|
// Get old partition assignation
|
||||||
let old_partitions = self.parse_assignation_data();
|
let old_partitions = self.parse_assignation_data();
|
||||||
|
|
||||||
|
// Create new partition assignation starting from old one
|
||||||
let mut partitions = old_partitions.clone();
|
let mut partitions = old_partitions.clone();
|
||||||
|
|
||||||
|
// Cleanup steps in new partition assignation:
|
||||||
|
let min_keep_nodes_per_part = (self.replication_factor + 1) / 2;
|
||||||
for part in partitions.iter_mut() {
|
for part in partitions.iter_mut() {
|
||||||
|
// - remove from assignation nodes that don't have a role in the layout anymore
|
||||||
part.nodes
|
part.nodes
|
||||||
.retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
|
.retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
|
||||||
|
|
||||||
|
// - remove from assignation some nodes that are in the same datacenter
|
||||||
|
// if we can, so that the later steps can ensure datacenter variety
|
||||||
|
// as much as possible (but still under the constraint that each partition
|
||||||
|
// should not move from at least a certain number of nodes that is
|
||||||
|
// min_keep_nodes_per_part)
|
||||||
|
'rmloop: while part.nodes.len() > min_keep_nodes_per_part {
|
||||||
|
let mut zns_c = HashMap::<&str, usize>::new();
|
||||||
|
for (_id, info) in part.nodes.iter() {
|
||||||
|
*zns_c.entry(info.unwrap().zone.as_str()).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
for i in 0..part.nodes.len() {
|
||||||
|
if zns_c[part.nodes[i].1.unwrap().zone.as_str()] > 1 {
|
||||||
|
part.nodes.remove(i);
|
||||||
|
continue 'rmloop;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// When nodes are removed, or when bootstraping an assignation from
|
// When nodes are removed, or when bootstraping an assignation from
|
||||||
|
@ -196,6 +222,8 @@ impl ClusterLayout {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
|
// Not enough nodes in cluster to build a correct assignation.
|
||||||
|
// Signal it by returning an error.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue