From 2e229d44303bfafa22aaf0d4aa299021a937220e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 12 Sep 2023 17:24:51 +0200 Subject: [PATCH 01/10] new layout: improve output display --- Cargo.lock | 1 + src/garage/cli/layout.rs | 89 +++++++++++++++++----------------------- src/rpc/Cargo.toml | 1 + src/rpc/layout.rs | 85 +++++++++++++++++++------------------- 4 files changed, 82 insertions(+), 94 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f4380c0..873cbce4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1370,6 +1370,7 @@ dependencies = [ "bytes", "bytesize", "err-derive", + "format_table", "futures", "futures-util", "garage_db", diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 3932f115..9bb90309 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -174,16 +174,12 @@ pub async fn cmd_show_layout( let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER LAYOUT ===="); - if !print_cluster_layout(&layout) { - println!("No nodes currently have a role in the cluster."); - println!("See `garage status` to view available nodes."); - } + print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); println!(); println!("Current cluster layout version: {}", layout.version); let has_role_changes = print_staging_role_changes(&layout); - let has_param_changes = print_staging_parameters_changes(&layout); - if has_role_changes || has_param_changes { + if has_role_changes { let v = layout.version; let res_apply = layout.apply_staged_changes(Some(v + 1)); @@ -193,9 +189,7 @@ pub async fn cmd_show_layout( Ok((layout, msg)) => { println!(); println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - if !print_cluster_layout(&layout) { - println!("No nodes have a role in the new layout."); - } + print_cluster_layout(&layout, "No nodes have a role in the new layout."); println!(); for line in msg.iter() { @@ -326,7 +320,7 @@ pub async fn send_layout( Ok(()) } -pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { +pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { @@ -356,61 +350,54 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { )); }; } - println!(); - println!("Parameters of the layout computation:"); - println!("Zone redundancy: {}", layout.parameters.zone_redundancy); - println!(); - if table.len() == 1 { - false - } else { + if table.len() > 1 { format_table(table); - true + } else { + println!("{}", empty_msg); } -} - -pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { - let has_changes = *layout.staging_parameters.get() != layout.parameters; - if has_changes { - println!(); - println!("==== NEW LAYOUT PARAMETERS ===="); - println!( - "Zone redundancy: {}", - layout.staging_parameters.get().zone_redundancy - ); - println!(); - } - has_changes + println!(); + println!("Zone redundancy: {}", layout.parameters.zone_redundancy); } pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { - let has_changes = layout + let has_role_changes = layout .staging_roles .items() .iter() .any(|(k, _, v)| layout.roles.get(k) != Some(v)); + let has_layout_changes = *layout.staging_parameters.get() != layout.parameters; - if has_changes { + if has_role_changes || has_layout_changes { println!(); println!("==== STAGED ROLE CHANGES ===="); - let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging_roles.items().iter() { - if layout.roles.get(id) == Some(role) { - continue; - } - if let Some(role) = &role.0 { - let tags = role.tags.join(","); - table.push(format!( - "{:?}\t{}\t{}\t{}", - id, - tags, - role.zone, - role.capacity_string() - )); - } else { - table.push(format!("{:?}\tREMOVED", id)); + if has_role_changes { + let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; + for (id, _, role) in layout.staging_roles.items().iter() { + if layout.roles.get(id) == Some(role) { + continue; + } + if let Some(role) = &role.0 { + let tags = role.tags.join(","); + table.push(format!( + "{:?}\t{}\t{}\t{}", + id, + tags, + role.zone, + role.capacity_string() + )); + } else { + table.push(format!("{:?}\tREMOVED", id)); + } } + format_table(table); + println!(); + } + if has_layout_changes { + println!( + "Zone redundancy: {}", + layout.staging_parameters.get().zone_redundancy + ); } - format_table(table); true } else { false diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 8ccee46f..2f22cd28 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -14,6 +14,7 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +format_table.workspace = true garage_db.workspace = true garage_util.workspace = true diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index c2655e59..76d29b08 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -585,16 +585,16 @@ impl ClusterLayout { // optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + msg.push("".into()); if old_assignment_opt != None { msg.push(format!( - "Optimal size of a partition: {} (was {} in the previous layout).", + "Optimal partition size: {} ({} in previous layout)", ByteSize::b(partition_size).to_string_as(false), ByteSize::b(self.partition_size).to_string_as(false) )); } else { msg.push(format!( - "Given the replication and redundancy constraints, the \ - optimal size of a partition is {}.", + "Optimal partition size: {}", ByteSize::b(partition_size).to_string_as(false) )); } @@ -618,7 +618,6 @@ impl ClusterLayout { // We display statistics of the computation msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); - msg.push("".to_string()); // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; @@ -931,29 +930,33 @@ impl ClusterLayout { let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); - msg.push("".into()); msg.push(format!( - "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", + "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", ByteSize::b(used_cap).to_string_as(false), ByteSize::b(total_cap).to_string_as(false), percent_cap )); - msg.push("".into()); - msg.push( - "If the percentage is too low, it might be that the \ - replication/redundancy constraints force the use of nodes/zones with small \ - storage capacities. \ - You might want to rebalance the storage capacities or relax the constraints. \ - See the detailed statistics below and look for saturated nodes/zones." - .into(), - ); msg.push(format!( - "Recall that because of the replication factor, the actual available \ - storage capacity is {} / {} = {}.", - ByteSize::b(used_cap).to_string_as(false), + "Effective capacity (replication factor {}): {}", self.replication_factor, ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) )); + if percent_cap < 80. { + msg.push("".into()); + msg.push( + "If the percentage is too low, it might be that the \ + replication/redundancy constraints force the use of nodes/zones with small \ + storage capacities." + .into(), + ); + msg.push( + "You might want to rebalance the storage capacities or relax the constraints." + .into(), + ); + msg.push( + "See the detailed statistics below and look for saturated nodes/zones.".into(), + ); + } // We define and fill in the following tables let storing_nodes = self.nongateway_nodes(); @@ -1007,10 +1010,10 @@ impl ClusterLayout { transferred.", total_new_partitions )); + msg.push("".into()); } - msg.push("".into()); - msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); + let mut table = vec![]; for z in 0..id_to_zone.len() { let mut nodes_of_z = Vec::::new(); for n in 0..storing_nodes.len() { @@ -1020,15 +1023,9 @@ impl ClusterLayout { } let replicated_partitions: usize = nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); - msg.push("".into()); - - msg.push(format!( - "Zone {}: {} distinct partitions stored ({} new, \ - {} partition copies) ", - id_to_zone[z], - stored_partitions_zone[z], - new_partitions_zone[z], - replicated_partitions + table.push(format!( + "{}\tTags\tPartitions\tCapacity\tUsable capacity", + id_to_zone[z] )); let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; @@ -1037,33 +1034,35 @@ impl ClusterLayout { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); - msg.push(format!( - " Usable capacity / Total capacity: {} / {} ({:.1}%).", - ByteSize::b(available_cap_z).to_string_as(false), - ByteSize::b(total_cap_z).to_string_as(false), - percent_cap_z - )); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self - .node_role(&self.node_id_vec[*n]) - .ok_or("Node not found."))? - .tags_string(); - msg.push(format!( - " Node {:?}: {} partitions ({} new) ; \ - usable/total capacity: {} / {} ({:.1}%) ; tags:{}", + let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); + table.push(format!( + " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", self.node_id_vec[*n], + tags_n, stored_partitions[*n], new_partitions[*n], ByteSize::b(available_cap_n).to_string_as(false), ByteSize::b(total_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, - tags_n )); } + + table.push(format!( + " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", + replicated_partitions, + stored_partitions_zone[z], + //new_partitions_zone[z], + ByteSize::b(available_cap_z).to_string_as(false), + ByteSize::b(total_cap_z).to_string_as(false), + percent_cap_z + )); + table.push("".into()); } + msg.push(format_table::format_table_to_string(table)); Ok(msg) } From 015ccb39aa511c72d0c899713a828491871da3e7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 11:57:36 +0200 Subject: [PATCH 02/10] new layout: make zone_redundancy optionnal (if not set, is maximum) --- src/garage/cli/layout.rs | 37 ++++++----- src/garage/cli/structs.rs | 4 +- src/rpc/layout.rs | 131 ++++++++++++++++++++++++++++---------- 3 files changed, 120 insertions(+), 52 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 9bb90309..557549e2 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -261,28 +261,35 @@ pub async fn cmd_config_layout( let mut did_something = false; match config_opt.redundancy { None => (), - Some(r) => { - if r > layout.replication_factor { - println!( - "The zone redundancy must be smaller or equal to the \ - replication factor ({}).", - layout.replication_factor - ); - } else if r < 1 { - println!("The zone redundancy must be at least 1."); - } else { - layout - .staging_parameters - .update(LayoutParameters { zone_redundancy: r }); - println!("The new zone redundancy has been saved ({}).", r); + Some(r_str) => { + let r = r_str + .parse::() + .ok_or_message("invalid zone redundancy value")?; + if let ZoneRedundancy::AtLeast(r_int) = r { + if r_int > layout.replication_factor { + return Err(Error::Message(format!( + "The zone redundancy must be smaller or equal to the \ + replication factor ({}).", + layout.replication_factor + ))); + } else if r_int < 1 { + return Err(Error::Message( + "The zone redundancy must be at least 1.".into(), + )); + } } + + layout + .staging_parameters + .update(LayoutParameters { zone_redundancy: r }); + println!("The new zone redundancy has been saved ({}).", r); did_something = true; } } if !did_something { return Err(Error::Message( - "Please specify an action for `garage layout config` to do".into(), + "Please specify an action for `garage layout config`".into(), )); } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index fd37a24e..c4ebeb1a 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -143,9 +143,9 @@ pub struct RemoveRoleOpt { #[derive(StructOpt, Debug)] pub struct ConfigLayoutOpt { - /// Zone redundancy parameter + /// Zone redundancy parameter ('none'/'max' or integer) #[structopt(short = "r", long = "redundancy")] - pub(crate) redundancy: Option, + pub(crate) redundancy: Option, } #[derive(StructOpt, Debug)] diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 76d29b08..9aa9c584 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,6 +1,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; +use std::fmt; use bytesize::ByteSize; use itertools::Itertools; @@ -115,7 +116,16 @@ mod v09 { /// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] pub struct LayoutParameters { - pub zone_redundancy: usize, + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// If None, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, } impl garage_util::migrate::Migrate for ClusterLayout { @@ -125,7 +135,6 @@ mod v09 { fn migrate(previous: Self::Previous) -> Self { use itertools::Itertools; - use std::collections::HashSet; // In the old layout, capacities are in an arbitrary unit, // but in the new layout they are in bytes. @@ -152,17 +161,10 @@ mod v09 { .min() .unwrap_or(0); - // Determine zone redundancy parameter - let zone_redundancy = std::cmp::min( - previous.replication_factor, - roles - .items() - .iter() - .filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str())) - .collect::>() - .len(), - ); - let parameters = LayoutParameters { zone_redundancy }; + // By default, zone_redundancy is None (i.e. maximum possible value) + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; let mut res = Self { version: previous.version, @@ -224,13 +226,37 @@ impl NodeRole { } } +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + // Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be equal to the replication factor, - // i.e. as strict as possible. + // We set the default zone redundancy to be None, meaning that the maximum + // possible value will be used depending on the cluster topology let parameters = LayoutParameters { - zone_redundancy: replication_factor, + zone_redundancy: ZoneRedundancy::Maximum, }; let staging_parameters = Lww::::new(parameters.clone()); @@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show` Ok(total_capacity) } + /// Returns the effective value of the zone_redundancy parameter + fn effective_zone_redundancy(&self) -> usize { + match self.parameters.zone_redundancy { + ZoneRedundancy::AtLeast(v) => v, + ZoneRedundancy::Maximum => { + let n_zones = self + .roles + .items() + .iter() + .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) + .collect::>() + .len(); + std::cmp::min(n_zones, self.replication_factor) + } + } + } + /// Check a cluster layout for internal consistency /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error @@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } // Check that every partition is associated to distinct nodes + let zone_redundancy = self.effective_zone_redundancy(); let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); @@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show` .expect("Zone not found.") }) .collect::>(); - let redundancy = self.parameters.zone_redundancy; - if zones_of_p.iter().unique().count() < redundancy { + if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( "nodes of partition are in less than {} distinct zones", - redundancy + zone_redundancy )); } } @@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); - match cl2.compute_optimal_partition_size(&zone_to_id) { + match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { Ok(s) if s != self.partition_size => { return Err(format!( "partition_size ({}) is different than optimal value ({})", @@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } } +// ==================================================================================== + // Implementation of the ClusterLayout methods related to the assignment algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignment. @@ -549,13 +594,15 @@ impl ClusterLayout { // changes in the layout. We retrieve the old_assignment reframed with new ids let old_assignment_opt = self.update_node_id_vec()?; + let zone_redundancy = self.effective_zone_redundancy(); + let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("".into()); msg.push(format!( "Partitions are \ replicated {} times on at least {} distinct zones.", - self.replication_factor, self.parameters.zone_redundancy + self.replication_factor, zone_redundancy )); // We generate for once numerical ids for the zones of non gateway nodes, @@ -570,12 +617,12 @@ impl ClusterLayout { nb_nongateway_nodes, self.replication_factor ))); } - if id_to_zone.len() < self.parameters.zone_redundancy { + if id_to_zone.len() < zone_redundancy { return Err(Error::Message(format!( "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", id_to_zone.len(), - self.parameters.zone_redundancy + zone_redundancy ))); } @@ -583,7 +630,7 @@ impl ClusterLayout { // Capacities should be given in a unit so that partition size is at least 100. // In this case, integer rounding plays a marginal role in the percentages of // optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; msg.push("".into()); if old_assignment_opt != None { @@ -610,7 +657,8 @@ impl ClusterLayout { // We compute a first flow/assignment that is heuristically close to the previous // assignment - let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?; + let mut gflow = + self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; if let Some(assoc) = &old_assignment_opt { // We minimize the distance to the previous assignment. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; @@ -735,9 +783,10 @@ impl ClusterLayout { fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, + zone_redundancy: usize, ) -> Result { let empty_set = HashSet::<(usize, usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; g.compute_maximal_flow()?; if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { return Err(Error::Message( @@ -750,7 +799,12 @@ impl ClusterLayout { let mut s_down = 1; let mut s_up = self.get_total_capacity()?; while s_down + 1 < s_up { - g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g = self.generate_flow_graph( + (s_down + s_up) / 2, + zone_to_id, + &empty_set, + zone_redundancy, + )?; g.compute_maximal_flow()?; if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { s_up = (s_down + s_up) / 2; @@ -789,18 +843,18 @@ impl ClusterLayout { partition_size: u64, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, + zone_redundancy: usize, ) -> Result, Error> { let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; g.add_edge( Vertex::Source, Vertex::Pdown(p), - (self.replication_factor - redundancy) as u64, + (self.replication_factor - zone_redundancy) as u64, )?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; @@ -829,6 +883,7 @@ impl ClusterLayout { &self, zone_to_id: &HashMap, prev_assign_opt: &Option>>, + zone_redundancy: usize, ) -> Result, Error> { // We list the (partition,node) associations that are not used in the // previous assignment @@ -846,7 +901,12 @@ impl ClusterLayout { } // We compute the best flow using only the edges used in the previous assignment - let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + let mut g = self.generate_flow_graph( + self.partition_size, + zone_to_id, + &exclude_edge, + zone_redundancy, + )?; g.compute_maximal_flow()?; // We add the excluded edges and compute the maximal flow with the full graph. @@ -997,7 +1057,7 @@ impl ClusterLayout { if *prev_assign_opt == None { new_partitions = stored_partitions.clone(); - new_partitions_zone = stored_partitions_zone.clone(); + //new_partitions_zone = stored_partitions_zone.clone(); } // We display the statistics @@ -1124,7 +1184,7 @@ mod tests { let mut curr_zone = 0; - let redundancy = cl.parameters.zone_redundancy; + let redundancy = cl.effective_zone_redundancy(); for replic in 0..cl.replication_factor { for p in 0..NB_PARTITIONS { @@ -1176,8 +1236,9 @@ mod tests { ); cl.staging_roles.merge(&update); } - cl.staging_parameters - .update(LayoutParameters { zone_redundancy }); + cl.staging_parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); cl.staging_hash = cl.calculate_staging_hash(); } From 749b4865d0a26c600fef79ab0456c827faafb9e8 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 12:07:45 +0200 Subject: [PATCH 03/10] new layout: improve display and fix comments --- src/garage/cli/layout.rs | 6 +++--- src/rpc/layout.rs | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 557549e2..ce2b11e0 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -282,7 +282,7 @@ pub async fn cmd_config_layout( layout .staging_parameters .update(LayoutParameters { zone_redundancy: r }); - println!("The new zone redundancy has been saved ({}).", r); + println!("The zone redundancy parameter has been set to '{}'.", r); did_something = true; } } @@ -359,11 +359,11 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { } if table.len() > 1 { format_table(table); + println!(); + println!("Zone redundancy: {}", layout.parameters.zone_redundancy); } else { println!("{}", empty_msg); } - println!(); - println!("Zone redundancy: {}", layout.parameters.zone_redundancy); } pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 9aa9c584..c106114b 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -121,7 +121,7 @@ mod v09 { /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies /// of each partition on at least that number of different zones. - /// If None, copies will be stored on the maximum possible number of zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] pub enum ZoneRedundancy { AtLeast(usize), @@ -161,7 +161,7 @@ mod v09 { .min() .unwrap_or(0); - // By default, zone_redundancy is None (i.e. maximum possible value) + // By default, zone_redundancy is maximum possible value let parameters = LayoutParameters { zone_redundancy: ZoneRedundancy::Maximum, }; @@ -253,7 +253,7 @@ impl core::str::FromStr for ZoneRedundancy { // Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be None, meaning that the maximum + // We set the default zone redundancy to be Maximum, meaning that the maximum // possible value will be used depending on the cluster topology let parameters = LayoutParameters { zone_redundancy: ZoneRedundancy::Maximum, @@ -1005,12 +1005,12 @@ impl ClusterLayout { msg.push("".into()); msg.push( "If the percentage is too low, it might be that the \ - replication/redundancy constraints force the use of nodes/zones with small \ + cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ storage capacities." .into(), ); msg.push( - "You might want to rebalance the storage capacities or relax the constraints." + "You might want to move storage capacity between zones or relax the redundancy constraint." .into(), ); msg.push( @@ -1105,8 +1105,8 @@ impl ClusterLayout { tags_n, stored_partitions[*n], new_partitions[*n], - ByteSize::b(available_cap_n).to_string_as(false), ByteSize::b(total_cap_n).to_string_as(false), + ByteSize::b(available_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, )); } @@ -1116,8 +1116,8 @@ impl ClusterLayout { replicated_partitions, stored_partitions_zone[z], //new_partitions_zone[z], - ByteSize::b(available_cap_z).to_string_as(false), ByteSize::b(total_cap_z).to_string_as(false), + ByteSize::b(available_cap_z).to_string_as(false), percent_cap_z )); table.push("".into()); From 0088599f52f38ae9e00fe772a416150813e2470b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 12:17:07 +0200 Subject: [PATCH 04/10] new layout: fix clippy lints --- src/rpc/graph_algo.rs | 8 ++++---- src/rpc/layout.rs | 22 +++++++++++----------- src/rpc/system.rs | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 65450d64..0e88efc4 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -189,7 +189,7 @@ impl Graph { let mut fifo = VecDeque::new(); fifo.push_back((idsource, 0)); while let Some((id, lvl)) = fifo.pop_front() { - if level[id] == None { + if level[id].is_none() { // it means id has not yet been reached level[id] = Some(lvl); for edge in self.graph[id].iter() { @@ -199,7 +199,7 @@ impl Graph { } } } - if level[idsink] == None { + if level[idsink].is_none() { // There is no residual flow break; } @@ -383,7 +383,7 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { for t in 0..forest.len() { let mut id = t; // while we are on a valid undiscovered node - while time_of_discovery[id] == None { + while time_of_discovery[id].is_none() { time_of_discovery[id] = Some(t); if let Some(i) = forest[id] { id = i; @@ -391,7 +391,7 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { break; } } - if forest[id] != None && time_of_discovery[id] == Some(t) { + if forest[id].is_some() && time_of_discovery[id] == Some(t) { // We discovered an id that we explored at this iteration t. // It means we are on a cycle let mut cy = vec![id; 1]; diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index c106114b..e02a180b 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -195,7 +195,7 @@ mod v09 { .. })) = role { - *cap = *cap * mul; + *cap *= mul; } new_roles.merge_raw(node, *ts, &role); } @@ -258,7 +258,7 @@ impl ClusterLayout { let parameters = LayoutParameters { zone_redundancy: ZoneRedundancy::Maximum, }; - let staging_parameters = Lww::::new(parameters.clone()); + let staging_parameters = Lww::::new(parameters); let empty_lwwmap = LwwMap::new(); @@ -322,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging_roles); self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = self.staging_parameters.get().clone(); + self.parameters = *self.staging_parameters.get(); self.staging_roles.clear(); self.staging_hash = self.calculate_staging_hash(); @@ -351,7 +351,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } self.staging_roles.clear(); - self.staging_parameters.update(self.parameters.clone()); + self.staging_parameters.update(self.parameters); self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -382,7 +382,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { - Some(role) if role.capacity != None => result.push(*uuid), + Some(role) if role.capacity.is_some() => result.push(*uuid), _ => (), } } @@ -633,7 +633,7 @@ impl ClusterLayout { let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; msg.push("".into()); - if old_assignment_opt != None { + if old_assignment_opt.is_some() { msg.push(format!( "Optimal partition size: {} ({} in previous layout)", ByteSize::b(partition_size).to_string_as(false), @@ -692,7 +692,7 @@ impl ClusterLayout { .roles .items() .iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) .map(|(k, _, _)| *k) .collect(); @@ -708,7 +708,7 @@ impl ClusterLayout { .roles .items() .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) .map(|(k, _, _)| *k) .collect(); @@ -770,7 +770,7 @@ impl ClusterLayout { for uuid in self.nongateway_nodes().iter() { let r = self.node_role(uuid).unwrap(); - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); id_to_zone.push(r.zone.clone()); } @@ -1055,7 +1055,7 @@ impl ClusterLayout { } } - if *prev_assign_opt == None { + if prev_assign_opt.is_none() { new_partitions = stored_partitions.clone(); //new_partitions_zone = stored_partitions_zone.clone(); } @@ -1063,7 +1063,7 @@ impl ClusterLayout { // We display the statistics msg.push("".into()); - if *prev_assign_opt != None { + if prev_assign_opt.is_some() { let total_new_partitions: usize = new_partitions.iter().sum(); msg.push(format!( "A total of {} new copies of partitions need to be \ diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 8fba9580..7fc3c20c 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -668,7 +668,7 @@ impl System { let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { - if prev_layout_check && !layout.check().is_ok() { + if prev_layout_check && layout.check().is_err() { error!("New cluster layout is invalid, discarding."); return Err(Error::Message( "New cluster layout is invalid, discarding.".into(), @@ -724,7 +724,7 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = !self.ring.borrow().layout.check().is_ok(); + let not_configured = self.ring.borrow().layout.check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; let expected_n_nodes = self.ring.borrow().layout.num_nodes(); let bad_peers = self From 013b026d566689a20f47ced69300ebc117a77171 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 12:18:56 +0200 Subject: [PATCH 05/10] update cargo.nix --- Cargo.nix | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Cargo.nix b/Cargo.nix index d6109ab5..591f60a5 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -7,13 +7,13 @@ args@{ "garage_db/default" "garage_util/default" "garage_rpc/default" + "format_table/default" "garage_table/default" "garage_block/default" "garage_model/default" "garage_api/default" "garage_web/default" "garage/default" - "format_table/default" "k2v-client/default" ], rustPackages, @@ -33,7 +33,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "7bef0004fa84feec502c75d50632d54202c272d56d2549fc09e2a356141685bb"; + nixifiedLockHash = "5df33eefe787762bf831e92c723c153faf8d5910332dcdf2fd941fe03be59936"; workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -60,13 +60,13 @@ in garage_db = rustPackages.unknown.garage_db."0.8.4"; garage_util = rustPackages.unknown.garage_util."0.8.4"; garage_rpc = rustPackages.unknown.garage_rpc."0.8.4"; + format_table = rustPackages.unknown.format_table."0.1.1"; garage_table = rustPackages.unknown.garage_table."0.8.4"; garage_block = rustPackages.unknown.garage_block."0.8.4"; garage_model = rustPackages.unknown.garage_model."0.8.4"; garage_api = rustPackages.unknown.garage_api."0.8.4"; garage_web = rustPackages.unknown.garage_web."0.8.4"; garage = rustPackages.unknown.garage."0.8.4"; - format_table = rustPackages.unknown.format_table."0.1.1"; k2v-client = rustPackages.unknown.k2v-client."0.0.4"; }; "registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec { @@ -1954,6 +1954,7 @@ in bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out; bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out; ${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; + format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.8.4" { inherit profileName; }).out; From 3ecc17f8c5c822ac5785e8a9fef34caf1a3312a2 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Sep 2023 11:21:35 +0200 Subject: [PATCH 06/10] new layout: use deterministic randomness for reproducible results --- src/rpc/graph_algo.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 0e88efc4..d8c6c9b9 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -1,7 +1,7 @@ //! This module deals with graph algorithms. //! It is used in layout.rs to build the partition to node assignment. -use rand::prelude::SliceRandom; +use rand::prelude::{SeedableRng, SliceRandom}; use std::cmp::{max, min}; use std::collections::HashMap; use std::collections::VecDeque; @@ -143,7 +143,11 @@ impl Graph { /// This function shuffles the order of the edge lists. It keeps the ids of the /// reversed edges consistent. fn shuffle_edges(&mut self) { - let mut rng = rand::thread_rng(); + // We use deterministic randomness so that the layout calculation algorihtm + // will output the same thing every time it is run. This way, the results + // pre-calculated in `garage layout show` will match exactly those used + // in practice with `garage layout apply` + let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]); for i in 0..self.graph.len() { self.graph[i].shuffle(&mut rng); // We need to update the ids of the reverse edges. From b4a0e636d830c7193968aaa1ec894e27596a1963 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Sep 2023 09:49:07 +0200 Subject: [PATCH 07/10] new layout doc: add examples of unexpected layout, to explain --- doc/book/operations/layout.md | 108 ++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/doc/book/operations/layout.md b/doc/book/operations/layout.md index 5e314246..39cbcc1c 100644 --- a/doc/book/operations/layout.md +++ b/doc/book/operations/layout.md @@ -75,3 +75,111 @@ If you are using the `garage` CLI to script layout changes, follow the following - **Only call `garage layout apply` once**, and call it **strictly after** all of the `layout assign` and `layout remove` commands have returned. + + +## Understanding unexpected layout calculations + + +### Example 1 + +``` +$ garage layout show +==== CURRENT CLUSTER LAYOUT ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%) + +Zone redundancy: maximum + +Current cluster layout version: 6 + +==== STAGED ROLE CHANGES ==== +ID Tags Zone Capacity +a11c7cf18af29737 node4 dc1 1000.0 MB + + +==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%) +a11c7cf18af29737 node4 dc1 1000.0 MB 0 B (0.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%) + +Zone redundancy: maximum + +==== COMPUTATION OF A NEW PARTITION ASSIGNATION ==== + +Partitions are replicated 3 times on at least 3 distinct zones. + +Optimal partition size: 3.9 MB (3.9 MB in previous layout) +Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %) +Effective capacity (replication factor 3): 1000.0 MB + +A total of 0 new copies of partitions need to be transferred. + +dc1 Tags Partitions Capacity Usable capacity + b10c110e4e854e5a node1 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + a11c7cf18af29737 node4 0 (0 new) 1000.0 MB 0 B (0.0%) + TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%) + +dc2 Tags Partitions Capacity Usable capacity + a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) + +dc3 Tags Partitions Capacity Usable capacity + 62b218d848e86a64 node3 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) +``` + +### Example 2 + +``` +==== CURRENT CLUSTER LAYOUT ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 500.0 MB (50.0%) +a11c7cf18af29737 node4 dc1 1000.0 MB 500.0 MB (50.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%) + +Zone redundancy: maximum + +Current cluster layout version: 8 + +==== STAGED ROLE CHANGES ==== +ID Tags Zone Capacity +a11c7cf18af29737 node4 dc3 1000.0 MB + + +==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 753.9 MB (75.4%) +a11c7cf18af29737 node4 dc3 1000.0 MB 246.1 MB (24.6%) + +Zone redundancy: maximum + +==== COMPUTATION OF A NEW PARTITION ASSIGNATION ==== + +Partitions are replicated 3 times on at least 3 distinct zones. + +Optimal partition size: 3.9 MB (3.9 MB in previous layout) +Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %) +Effective capacity (replication factor 3): 1000.0 MB + +A total of 128 new copies of partitions need to be transferred. + +dc1 Tags Partitions Capacity Usable capacity + b10c110e4e854e5a node1 256 (128 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) + +dc2 Tags Partitions Capacity Usable capacity + a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) + +dc3 Tags Partitions Capacity Usable capacity + 62b218d848e86a64 node3 193 (0 new) 1000.0 MB 753.9 MB (75.4%) + a11c7cf18af29737 node4 63 (0 new) 1000.0 MB 246.1 MB (24.6%) + TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%) +``` From 405aa42b7dae67adbb32fd0384534bb8bad555ad Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Sep 2023 10:00:01 +0200 Subject: [PATCH 08/10] layout doc: update old text --- doc/book/operations/layout.md | 38 ++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/doc/book/operations/layout.md b/doc/book/operations/layout.md index 39cbcc1c..93d502a7 100644 --- a/doc/book/operations/layout.md +++ b/doc/book/operations/layout.md @@ -9,18 +9,30 @@ a certain capacity, or a gateway node that does not store data and is only used as an API entry point for faster cluster access. An introduction to building cluster layouts can be found in the [production deployment](@/documentation/cookbook/real-world.md) page. +In Garage, all of the data that can be stored in a given cluster is divided +into slices which we call *partitions*. Each partition is stored by +one or several nodes in the cluster +(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication-mode)). +The layout determines the correspondence between these partition, +which exist on a logical level, and actual storage nodes. + ## How cluster layouts work in Garage -In Garage, a cluster layout is composed of the following components: +A cluster layout is composed of the following components: -- a table of roles assigned to nodes +- a table of roles assigned to nodes, defined by the user +- an optimal assignation of partitions to nodes, computed by an algorithm that is ran once when calling `garage layout apply` or the ApplyClusterLayout API endpoint - a version number Garage nodes will always use the cluster layout with the highest version number. Garage nodes also maintain and synchronize between them a set of proposed role changes that haven't yet been applied. These changes will be applied (or -canceled) in the next version of the layout +canceled) in the next version of the layout. + +All operations on the layout can be realized using the `garage` CLI or using the +[administration API endpoint](@/documentation/reference-manual/admin-api.md). +We give here a description of CLI commands, the admin API semantics are very similar. The following commands insert modifications to the set of proposed role changes for the next layout version (but they do not create the new layout immediately): @@ -51,7 +63,7 @@ commands will fail otherwise. ## Warnings about Garage cluster layout management -**Warning: never make several calls to `garage layout apply` or `garage layout +**⚠️ Never make several calls to `garage layout apply` or `garage layout revert` with the same value of the `--version` flag. Doing so can lead to the creation of several different layouts with the same version number, in which case your Garage cluster will become inconsistent until fixed.** If a call to @@ -65,16 +77,18 @@ shell, you shouldn't have much issues as long as you run commands one after the other and take care of checking the output of `garage layout show` before applying any changes. -If you are using the `garage` CLI to script layout changes, follow the following recommendations: +If you are using the `garage` CLI or the admin API to script layout changes, +follow the following recommendations: -- Make all of your `garage` CLI calls to the same RPC host. Do not use the - `garage` CLI to connect to individual nodes to send them each a piece of the - layout changes you are making, as the changes propagate asynchronously - between nodes and might not all be taken into account at the time when the - new layout is applied. +- If using the CLI, make all of your `garage` CLI calls to the same RPC host. + If using the admin API, make all of your API calls to the same Garage node. Do + not connect to individual nodes to send them each a piece of the layout changes + you are making, as the changes propagate asynchronously between nodes and might + not all be taken into account at the time when the new layout is applied. -- **Only call `garage layout apply` once**, and call it **strictly after** all - of the `layout assign` and `layout remove` commands have returned. +- **Only call `garage layout apply`/ApplyClusterLayout once**, and call it + **strictly after** all of the `layout assign` and `layout remove` + commands/UpdateClusterLayout API calls have returned. ## Understanding unexpected layout calculations From 8d07888fa2da8f72aa1e7e63bfd9ac9eb3b1e6ab Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Sep 2023 16:07:46 +0200 Subject: [PATCH 09/10] layout doc: write explanations for bizarre scenarios --- doc/book/operations/layout.md | 74 +++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/doc/book/operations/layout.md b/doc/book/operations/layout.md index 93d502a7..0eb0e215 100644 --- a/doc/book/operations/layout.md +++ b/doc/book/operations/layout.md @@ -93,9 +93,23 @@ follow the following recommendations: ## Understanding unexpected layout calculations +When adding, removing or modifying nodes in a cluster layout, sometimes +unexpected assigntations of partitions to node can occure. These assignations +are in fact normal and logical, given the objectives of the algorihtm. Indeed, +**the layout algorithm prioritizes moving less data between nodes over the fact +of achieving equal distribution of load**. This section presents two examples +and illustrates how one can control Garage's behavior to obtain the desired +results. ### Example 1 +In this example, a cluster is originally composed of 3 nodes in 3 different +zones (data centers). The three nodes are of equal capacity, therefore they +are all fully exploited and all store a copy of all of the data in the cluster. + +Then, a fourth node of the same size is added in the datacenter `dc1`. +As illustrated by the following, **Garage will by default not store any data on the new node**: + ``` $ garage layout show ==== CURRENT CLUSTER LAYOUT ==== @@ -146,8 +160,37 @@ dc3 Tags Partitions Capacity Usable capacity TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) ``` +While unexpected, this is logical because of the following facts: + +- storing some data on the new node does not help increase the total quantity + of data that can be stored on the cluster, as the two other zones (`dc2` and + `dc3`) still need to store a full copy of everything, and their capacity is + still the same; + +- there is therefore no need to move any data on the new node as this would be pointless; + +- moving data to the new node has a cost which the algorithm decides to not pay if not necessary. + +This distribution of data can however not be what the administrator wanted: if +they added a new node to `dc1`, it might be because the existing node is too +slow, and they wish to divide its load by half. In that case, what they need to +do to force Garage to distribute the data between the two nodes is to attribute +only half of the capacity to each node in `dc1` (in our example, 500M instead of 1G). +In that case, Garage would determine that to be able to store 1G in total, it +would need to store 500M on the old node and 500M on the added one. + + ### Example 2 +The following example is a slightly different scenario, where `dc1` had two +nodes that were used at 50%, and `dc2` and `dc3` each have one node that is +100% used. All node capacities are the same. + +Then, a node from `dc1` is moved into `dc3`. One could expect that the roles of +`dc1` and `dc3` would simply be swapped: the remaining node in `dc1` would be +used at 100%, and the two nodes now in `dc3` would be used at 50%. Instead, +this happens: + ``` ==== CURRENT CLUSTER LAYOUT ==== ID Tags Zone Capacity Usable capacity @@ -197,3 +240,34 @@ dc3 Tags Partitions Capacity Usable capacity a11c7cf18af29737 node4 63 (0 new) 1000.0 MB 246.1 MB (24.6%) TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%) ``` + +As we can see, the node that was moved to `dc3` (node4) is only used at 25% (approximatively), +whereas the node that was already in `dc3` (node3) is used at 75%. + +This can be explained by the following: + +- node1 will now be the only node remaining in `dc1`, thus it has to store all + of the data in the cluster. Since it was storing only half of it before, it has + to retrieve the other half from other nodes in the cluster. + +- The data which it does not have is entirely stored by the other node that was + in `dc1` and that is now in `dc3` (node4). There is also a copy of it on node2 + and node3 since both these nodes have a copy of everything. + +- node3 and node4 are the two nodes that will now be in a datacenter that is + under-utilized (`dc3`), this means that those are the two candidates from which + data can be removed to be moved to node1. + +- Garage will move data in equal proportions from all possible sources, in this + case it means that it will tranfer 25% of the entire data set from node3 to + node1 and another 25% from node4 to node1. + +This explains why node3 ends with 75% utilization (100% from before minus 25% +that is moved to node1), and node4 ends with 25% (50% from before minus 25% +that is moved to node1). + +This illustrates another principle of the layout computation: **if there is a +choice in moving data out of some nodes, then all links between pairs of nodes +are used in equal proportions** (this is approximately true, there is +randomness in the algorihtm to achieve this so there might be some small +fluctuations, as we see above). From 0e5925fff6d9b3147de4967e1963b4c785d0055f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Sep 2023 16:09:17 +0200 Subject: [PATCH 10/10] layout doc: reformulate --- doc/book/operations/layout.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/book/operations/layout.md b/doc/book/operations/layout.md index 0eb0e215..ece17ddb 100644 --- a/doc/book/operations/layout.md +++ b/doc/book/operations/layout.md @@ -94,12 +94,13 @@ follow the following recommendations: ## Understanding unexpected layout calculations When adding, removing or modifying nodes in a cluster layout, sometimes -unexpected assigntations of partitions to node can occure. These assignations +unexpected assigntations of partitions to node can occur. These assignations are in fact normal and logical, given the objectives of the algorihtm. Indeed, **the layout algorithm prioritizes moving less data between nodes over the fact -of achieving equal distribution of load**. This section presents two examples -and illustrates how one can control Garage's behavior to obtain the desired -results. +of achieving equal distribution of load. It also tries to use all links between +pairs of nodes in equal proportions when moving data.** This section presents +two examples and illustrates how one can control Garage's behavior to obtain +the desired results. ### Example 1 @@ -266,8 +267,8 @@ This explains why node3 ends with 75% utilization (100% from before minus 25% that is moved to node1), and node4 ends with 25% (50% from before minus 25% that is moved to node1). -This illustrates another principle of the layout computation: **if there is a -choice in moving data out of some nodes, then all links between pairs of nodes -are used in equal proportions** (this is approximately true, there is +This illustrates the second principle of the layout computation: **if there is +a choice in moving data out of some nodes, then all links between pairs of +nodes are used in equal proportions** (this is approximately true, there is randomness in the algorihtm to achieve this so there might be some small fluctuations, as we see above).