From fcf9ac674a2842b2b55d933e60af5af93dcc4592 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:19:25 +0200 Subject: [PATCH] Tests written in layout.rs added staged_parameters to ClusterLayout removed the serde(default) -> will need a migration function --- src/db/lib.rs | 2 +- src/garage/cli/layout.rs | 4 +- src/rpc/graph_algo.rs | 14 --- src/rpc/layout.rs | 228 ++++++++++++++++++--------------------- 4 files changed, 105 insertions(+), 143 deletions(-) diff --git a/src/db/lib.rs b/src/db/lib.rs index d96586be4..af5394944 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -3,7 +3,7 @@ extern crate tracing; #[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] -compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); +//compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); #[cfg(feature = "lmdb")] pub mod lmdb_adapter; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 9e5bdaead..32f637ebc 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -190,7 +190,7 @@ pub async fn cmd_show_layout( println!(); println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); - println!("Zone redundancy: {}", layout.parameters.get().zone_redundancy); + println!("Zone redundancy: {}", layout.staged_parameters.get().zone_redundancy); println!(); // this will print the stats of what partitions @@ -270,7 +270,7 @@ pub async fn cmd_config_layout( println!("The zone redundancy must be at least 1."); } else { - layout.parameters.update(LayoutParameters{ zone_redundancy: r }); + layout.staged_parameters.update(LayoutParameters{ zone_redundancy: r }); println!("The new zone redundancy has been saved ({}).", r); } } diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 4e27631aa..70ccf35ad 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -419,17 +419,3 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { 
} -//==================================================================================== -//==================================================================================== -//==================================================================================== -//==================================================================================== -//==================================================================================== -//==================================================================================== - - -#[cfg(test)] -mod tests { - use super::*; - -} - diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 1969b7210..976f94af6 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -30,8 +30,8 @@ pub struct ClusterLayout { //This attribute is only used to retain the previously computed partition size, //to know to what extent does it change with the layout update. - #[serde(default="default_partition_size")] pub partition_size: u32, + pub parameters: LayoutParameters, pub roles: LwwMap, @@ -49,20 +49,11 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout - #[serde(default="default_layout_parameters")] - pub parameters: Lww, + pub staged_parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } -fn default_partition_size() -> u32{ - 0 -} - -fn default_layout_parameters() -> Lww{ - Lww::::new(LayoutParameters{ zone_redundancy: 1}) -} - ///This struct is used to set the parameters to be used in the assignation computation ///algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] @@ -124,8 +115,8 @@ impl ClusterLayout { //We set the default zone redundancy to be equal to the replication factor, //i.e. as strict as possible. 
- let default_parameters = Lww::::new( - LayoutParameters{ zone_redundancy: replication_factor}); + let parameters = LayoutParameters{ zone_redundancy: replication_factor}; + let staged_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); @@ -137,7 +128,8 @@ impl ClusterLayout { roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), - parameters: default_parameters, + parameters, + staged_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -150,8 +142,8 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.parameters.get() != other.parameters.get(); - self.parameters.merge(&other.parameters); + let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); + self.staged_parameters.merge(&other.staged_parameters); self.staging.merge(&other.staging); @@ -330,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let zones_of_p = nodes_of_p.iter() .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.")); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.parameters.zone_redundancy; if zones_of_p.unique().count() < redundancy { return false; } @@ -384,7 +376,8 @@ impl ClusterLayout { //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; + let mut msg = Message::new(); msg.push(format!("Computation of a new cluster layout where partitions are \ @@ -417,13 +410,15 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!("Given the replication and redundancy constraint, the \ optimal size of a partition is {}. 
In the previous layout, it used to \ - be {}.", partition_size, self.partition_size)); + be {} (the zone redundancy was {}).", partition_size, self.partition_size, + self.parameters.zone_redundancy)); } else { msg.push(format!("Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", partition_size)); } self.partition_size = partition_size; + self.parameters = self.staged_parameters.get().clone(); if partition_size < 100 { msg.push("WARNING: The partition size is low (< 100), you might consider to \ @@ -511,6 +506,10 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); + + if !self.check() { + return Err(Error::Message("Critical error: The computed layout happens to be incorrect".into())); + } Ok(Some(old_assignation)) } @@ -585,7 +584,7 @@ impl ClusterLayout { self.useful_nodes().len()); let mut g= Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; for p in 0..NB_PARTITIONS { g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; @@ -800,96 +799,80 @@ impl ClusterLayout { #[cfg(test)] mod tests { - use super::*; - use std::io::*; -// use itertools::Itertools; -/* - fn check_assignation(cl: &ClusterLayout) { - //Check that input data has the right format - let nb_partitions = 1usize << PARTITION_BITS; - assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); + use super::{*,Error}; + use std::cmp::min; - //Check that is is a correct assignation with zone redundancy - let rf = cl.replication_factor; - for i in 0..nb_partitions { - assert!( - rf == cl.ring_assignation_data[rf * i..rf * (i + 1)] - .iter() - .map(|nod| node_zone[*nod as usize].clone()) - .unique() - .count() - ); - } - let nb_nodes = cl.node_id_vec.len(); - //Check optimality - 
let node_nb_part = (0..nb_nodes) - .map(|i| { - cl.ring_assignation_data - .iter() - .filter(|x| **x == i as u8) - .count() - }) - .collect::>(); + //This function checks that the partition size S computed is at least better than the + //one given by a very naive algorithm. To do so, we try to run the naive algorithm + //assuming a partition size of S+1. If we succeed, it means that the computed assignation + //was not optimal. The naive algorithm is the following : + //- we compute the max number of partitions associated to every node, capped at the + //partition number. It gives the number of tokens of every node. + //- every zone has a number of tokens equal to the sum of the tokens of its nodes. + //- we cycle over the partitions and associate zone tokens while respecting the + //zone redundancy constraint. + //NOTE: the naive algorithm is not optimal. Counter example: + //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + //With these parameters, the naive algo fails, whereas there is a solution: + //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &ClusterLayout) -> Result { + let over_size = cl.partition_size +1; + let mut zone_token = HashMap::::new(); + let nb_partitions = 1usize << PARTITION_BITS; + + let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } - let zone_vec = node_zone.iter().unique().collect::>(); - let zone_nb_part = zone_vec - .iter() - .map(|z| { - cl.ring_assignation_data - .iter() - .filter(|x| node_zone[**x as usize] == **z) - .count() - }) - .collect::>(); + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.useful_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert(z.clone(), zone_token[&z] + min(nb_partitions , (c/over_size) as usize)); + } + + //For every partition, we count the number of zones 
already associated and + //the name of the last zone associated - //Check optimality of the zone assignation : would it be better for the - //node_capacity/node_partitions ratio to change the assignation of a partition + let mut id_zone_token = vec![0; zones.len()]; + for (z,t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } - if let Some(idmin) = (0..nb_nodes).min_by(|i, j| { - (node_capacity[*i] * node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) - }) { - if let Some(idnew) = (0..nb_nodes) - .filter(|i| { - if let Some(p) = zone_vec.iter().position(|z| **z == node_zone[*i]) { - zone_nb_part[p] < nb_partitions - } else { - false - } - }) - .max_by(|i, j| { - (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) - .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) - }) { - assert!( - node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) - >= node_capacity[idnew] * node_nb_part[idmin] as u32 - ); - } - } + let mut nb_token = vec![0; nb_partitions]; + let mut last_zone = vec![zones.len(); nb_partitions]; + + let mut curr_zone = 0; + + let redundancy = cl.parameters.zone_redundancy; + + for replic in 0..cl.replication_factor { + for p in 0..nb_partitions { + while id_zone_token[curr_zone] == 0 || + (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } - //In every zone, check optimality of the nod assignation - for z in zone_vec { - let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z); - if let Some(idmin) = node_of_z_iter.clone().min_by(|i, j| { - (node_capacity[*i] * node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) - }) { - if let Some(idnew) = node_of_z_iter.min_by(|i, j| { - (node_capacity[*i] * 
(node_nb_part[*j] as u32 + 1)) - .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) - }) { - assert!( - node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) - >= node_capacity[idnew] * node_nb_part[idmin] as u32 - ); - } - } - } - } -*/ - fn show_msg(msg : &Message) { for s in msg.iter(){ println!("{}",s); @@ -901,6 +884,7 @@ mod tests { node_id_vec: &Vec, node_capacity_vec: &Vec, node_zone_vec: &Vec, + zone_redundancy: usize ) { for i in 0..node_id_vec.len() { if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { @@ -917,11 +901,11 @@ mod tests { ); cl.roles.merge(&update); } + cl.staged_parameters = Lww::::new(LayoutParameters{zone_redundancy}); } #[test] fn test_assignation() { - std::io::stdout().flush().ok().expect("Could not flush stdout"); let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] @@ -929,22 +913,11 @@ mod tests { .map(|x| x.to_string()) .collect(); - let mut cl = ClusterLayout { - node_id_vec: vec![], - - roles: LwwMap::new(), - - replication_factor: 3, - zone_redundancy: 1, - partition_size: 0, - ring_assignation_data: vec![], - version: 0, - staging: LwwMap::new(), - staging_hash: blake2sum(&rmp_to_vec_all_named(&LwwMap::::new()).unwrap()[..]), - }; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + let mut cl = ClusterLayout::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; @@ -952,19 +925,22 @@ mod tests { .into_iter() .map(|x| x.to_string()) .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - 
show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,1).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); } }