Merge pull request 'New layout: fixes and UX improvements' (#634) from new-layout-ux into next

Reviewed-on: #634
Alex 2023-09-27 09:04:32 +00:00
commit aa7eadc799
9 changed files with 438 additions and 180 deletions

Cargo.lock generated
View file

@ -1370,6 +1370,7 @@ dependencies = [
"bytes", "bytes",
"bytesize", "bytesize",
"err-derive", "err-derive",
"format_table",
"futures", "futures",
"futures-util", "futures-util",
"garage_db", "garage_db",

View file

@ -7,13 +7,13 @@ args@{
"garage_db/default" "garage_db/default"
"garage_util/default" "garage_util/default"
"garage_rpc/default" "garage_rpc/default"
"format_table/default"
"garage_table/default" "garage_table/default"
"garage_block/default" "garage_block/default"
"garage_model/default" "garage_model/default"
"garage_api/default" "garage_api/default"
"garage_web/default" "garage_web/default"
"garage/default" "garage/default"
"format_table/default"
"k2v-client/default" "k2v-client/default"
], ],
rustPackages, rustPackages,
@ -33,7 +33,7 @@ args@{
ignoreLockHash, ignoreLockHash,
}: }:
let let
nixifiedLockHash = "7bef0004fa84feec502c75d50632d54202c272d56d2549fc09e2a356141685bb"; nixifiedLockHash = "5df33eefe787762bf831e92c723c153faf8d5910332dcdf2fd941fe03be59936";
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc; workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
lockHashIgnored = if ignoreLockHash lockHashIgnored = if ignoreLockHash
@ -60,13 +60,13 @@ in
garage_db = rustPackages.unknown.garage_db."0.8.4"; garage_db = rustPackages.unknown.garage_db."0.8.4";
garage_util = rustPackages.unknown.garage_util."0.8.4"; garage_util = rustPackages.unknown.garage_util."0.8.4";
garage_rpc = rustPackages.unknown.garage_rpc."0.8.4"; garage_rpc = rustPackages.unknown.garage_rpc."0.8.4";
format_table = rustPackages.unknown.format_table."0.1.1";
garage_table = rustPackages.unknown.garage_table."0.8.4"; garage_table = rustPackages.unknown.garage_table."0.8.4";
garage_block = rustPackages.unknown.garage_block."0.8.4"; garage_block = rustPackages.unknown.garage_block."0.8.4";
garage_model = rustPackages.unknown.garage_model."0.8.4"; garage_model = rustPackages.unknown.garage_model."0.8.4";
garage_api = rustPackages.unknown.garage_api."0.8.4"; garage_api = rustPackages.unknown.garage_api."0.8.4";
garage_web = rustPackages.unknown.garage_web."0.8.4"; garage_web = rustPackages.unknown.garage_web."0.8.4";
garage = rustPackages.unknown.garage."0.8.4"; garage = rustPackages.unknown.garage."0.8.4";
format_table = rustPackages.unknown.format_table."0.1.1";
k2v-client = rustPackages.unknown.k2v-client."0.0.4"; k2v-client = rustPackages.unknown.k2v-client."0.0.4";
}; };
"registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec {
@ -1954,6 +1954,7 @@ in
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out;
bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out; bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; ${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.8.4" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.8.4" { inherit profileName; }).out;

View file

@ -9,18 +9,30 @@ a certain capacity, or a gateway node that does not store data and is only
used as an API entry point for faster cluster access.
An introduction to building cluster layouts can be found in the [production deployment](@/documentation/cookbook/real-world.md) page.

In Garage, all of the data that can be stored in a given cluster is divided
into slices which we call *partitions*. Each partition is stored by
one or several nodes in the cluster
(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication-mode)).
The layout determines the correspondence between these partitions,
which exist on a logical level, and actual storage nodes.

## How cluster layouts work in Garage

A cluster layout is composed of the following components:

- a table of roles assigned to nodes, defined by the user
- an optimal assignation of partitions to nodes, computed by an algorithm that is run once when calling `garage layout apply` or the ApplyClusterLayout API endpoint
- a version number

Garage nodes will always use the cluster layout with the highest version number.

Garage nodes also maintain and synchronize between them a set of proposed role
changes that haven't yet been applied. These changes will be applied (or
canceled) in the next version of the layout.

All operations on the layout can be realized using the `garage` CLI or using the
[administration API endpoint](@/documentation/reference-manual/admin-api.md).
We give here a description of the CLI commands; the admin API semantics are very similar.
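
For example, applying a staged layout at version 3 with the CLI (`garage layout apply --version 3`)
corresponds roughly to the following admin API call. This is only a sketch: the `/v1/` path prefix,
the port 3903 and the Bearer token authentication are assumptions taken from the admin API reference
and may differ depending on your Garage version.

```
# ApplyClusterLayout: roughly equivalent to `garage layout apply --version 3`
curl -X POST http://localhost:3903/v1/layout/apply \
  -H "Authorization: Bearer $ADMIN_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"version": 3}'
```
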
The following commands insert modifications to the set of proposed role changes
for the next layout version (but they do not create the new layout immediately):

@ -51,7 +63,7 @@ commands will fail otherwise.

## Warnings about Garage cluster layout management

**⚠️ Never make several calls to `garage layout apply` or `garage layout
revert` with the same value of the `--version` flag. Doing so can lead to the
creation of several different layouts with the same version number, in which
case your Garage cluster will become inconsistent until fixed.** If a call to

@ -65,13 +77,198 @@ shell, you shouldn't have many issues as long as you run commands one after
the other and take care of checking the output of `garage layout show`
before applying any changes.

If you are using the `garage` CLI or the admin API to script layout changes,
follow these recommendations:

- If using the CLI, make all of your `garage` CLI calls to the same RPC host.
  If using the admin API, make all of your API calls to the same Garage node.
  Do not connect to individual nodes to send them each a piece of the layout
  changes you are making, as the changes propagate asynchronously between nodes
  and might not all be taken into account at the time when the new layout is applied.
- **Only call `garage layout apply`/ApplyClusterLayout once**, and call it
  **strictly after** all of the `layout assign` and `layout remove`
  commands/UpdateClusterLayout API calls have returned (see the scripted sketch below).
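
As an illustration, a scripted layout change following these recommendations could look
like the sketch below. The node IDs, zones, capacities and version number are placeholders
to adapt to your cluster; check the `garage` CLI help for the exact option names on your version.

```
#!/bin/sh
set -e
# Run every command against the same node, e.g. by executing this script
# on that node (or by passing the same --rpc-host to every call).

# Stage all role changes first (node IDs and parameters are examples)
garage layout assign 563e1ac8 -z dc1 -c 1G -t node5
garage layout assign 86f0f26a -z dc2 -c 1G -t node6

# Review the proposed changes and the resulting assignation before applying
garage layout show

# Apply exactly once, strictly after all staging commands have returned,
# using the next version number reported by `garage layout show`
garage layout apply --version 7
```
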
## Understanding unexpected layout calculations

When adding, removing or modifying nodes in a cluster layout, sometimes
unexpected assignations of partitions to nodes can occur. These assignations
are in fact normal and logical, given the objectives of the algorithm. Indeed,
**the layout algorithm prioritizes moving as little data as possible between
nodes over achieving a perfectly even distribution of load. It also tries to
use all links between pairs of nodes in equal proportions when moving data.**
This section presents two examples and illustrates how one can control Garage's
behavior to obtain the desired results.
### Example 1

In this example, a cluster is originally composed of 3 nodes in 3 different
zones (data centers). The three nodes have equal capacity, therefore they
are all fully used and each stores a copy of all of the data in the cluster.

Then, a fourth node of the same size is added in datacenter `dc1`.
As illustrated by the following, **Garage will by default not store any data on the new node**:
```
$ garage layout show
==== CURRENT CLUSTER LAYOUT ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
Zone redundancy: maximum
Current cluster layout version: 6
==== STAGED ROLE CHANGES ====
ID Tags Zone Capacity
a11c7cf18af29737 node4 dc1 1000.0 MB
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
a11c7cf18af29737 node4 dc1 1000.0 MB 0 B (0.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
Zone redundancy: maximum
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
Partitions are replicated 3 times on at least 3 distinct zones.
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
Effective capacity (replication factor 3): 1000.0 MB
A total of 0 new copies of partitions need to be transferred.
dc1 Tags Partitions Capacity Usable capacity
b10c110e4e854e5a node1 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
a11c7cf18af29737 node4 0 (0 new) 1000.0 MB 0 B (0.0%)
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
dc2 Tags Partitions Capacity Usable capacity
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
dc3 Tags Partitions Capacity Usable capacity
62b218d848e86a64 node3 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
```
While unexpected, this is logical because of the following facts:
- storing some data on the new node does not help increase the total quantity
of data that can be stored on the cluster, as the two other zones (`dc2` and
`dc3`) still need to store a full copy of everything, and their capacity is
still the same;
- there is therefore no need to move any data to the new node, as this would be pointless;
- moving data to the new node has a cost, which the algorithm decides not to pay if it is not necessary.

This distribution of data might however not be what the administrator wanted: if
they added a new node to `dc1`, it might be because the existing node is too
slow, and they wish to divide its load by half. In that case, to force Garage
to distribute the data between the two nodes, they need to assign only half of
the capacity to each node in `dc1` (in our example, 500M instead of 1G).
Garage will then determine that to be able to store 1G in total, it
needs to store 500M on the old node and 500M on the added one.
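
Concretely, with the node IDs from this example, halving the announced capacities could be
staged and applied as in the following sketch (the `--version` value is the next version
number reported by `garage layout show`):

```
garage layout assign b10c110e4e854e5a -z dc1 -c 500M
garage layout assign a11c7cf18af29737 -z dc1 -c 500M
garage layout show                 # check the new assignation
garage layout apply --version 7
```

Here `-z dc1` is repeated only for clarity; the capacity is the only parameter that changes.
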
### Example 2
The following example is a slightly different scenario, where `dc1` has two
nodes that are used at 50%, and `dc2` and `dc3` each have one node that is
100% used. All node capacities are the same.
Then, a node from `dc1` is moved into `dc3`. One could expect that the roles of
`dc1` and `dc3` would simply be swapped: the remaining node in `dc1` would be
used at 100%, and the two nodes now in `dc3` would be used at 50%. Instead,
this happens:
```
==== CURRENT CLUSTER LAYOUT ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 500.0 MB (50.0%)
a11c7cf18af29737 node4 dc1 1000.0 MB 500.0 MB (50.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
Zone redundancy: maximum
Current cluster layout version: 8
==== STAGED ROLE CHANGES ====
ID Tags Zone Capacity
a11c7cf18af29737 node4 dc3 1000.0 MB
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 753.9 MB (75.4%)
a11c7cf18af29737 node4 dc3 1000.0 MB 246.1 MB (24.6%)
Zone redundancy: maximum
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
Partitions are replicated 3 times on at least 3 distinct zones.
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
Effective capacity (replication factor 3): 1000.0 MB
A total of 128 new copies of partitions need to be transferred.
dc1 Tags Partitions Capacity Usable capacity
b10c110e4e854e5a node1 256 (128 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
dc2 Tags Partitions Capacity Usable capacity
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
dc3 Tags Partitions Capacity Usable capacity
62b218d848e86a64 node3 193 (0 new) 1000.0 MB 753.9 MB (75.4%)
a11c7cf18af29737 node4 63 (0 new) 1000.0 MB 246.1 MB (24.6%)
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
```
As we can see, the node that was moved to `dc3` (node4) is only used at 25% (approximately),
whereas the node that was already in `dc3` (node3) is used at 75%.
This can be explained by the following:
- node1 will now be the only node remaining in `dc1`, thus it has to store all
of the data in the cluster. Since it was storing only half of it before, it has
to retrieve the other half from other nodes in the cluster.
- The data which it does not have is entirely stored by the other node that was
in `dc1` and that is now in `dc3` (node4). There is also a copy of it on node2
and node3 since both these nodes have a copy of everything.
- node3 and node4 are the two nodes that will now be in an under-utilized
datacenter (`dc3`), which means that they are the two candidates from which
data can be removed to be moved to node1.
- Garage will move data in equal proportions from all possible sources; in this
case, it will transfer 25% of the entire data set from node3 to
node1 and another 25% from node4 to node1.
This explains why node3 ends with 75% utilization (100% from before minus 25%
that is moved to node1), and node4 ends with 25% (50% from before minus 25%
that is moved to node1).
This illustrates the second principle of the layout computation: **if there is
a choice in moving data out of some nodes, then all links between pairs of
nodes are used in equal proportions** (this is only approximately true: the
algorithm uses randomness to achieve this, so there might be some small
fluctuations, as we see above).

View file

@ -174,16 +174,12 @@ pub async fn cmd_show_layout(
let layout = fetch_layout(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== CURRENT CLUSTER LAYOUT ===="); println!("==== CURRENT CLUSTER LAYOUT ====");
if !print_cluster_layout(&layout) { print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
println!("No nodes currently have a role in the cluster.");
println!("See `garage status` to view available nodes.");
}
println!(); println!();
println!("Current cluster layout version: {}", layout.version); println!("Current cluster layout version: {}", layout.version);
let has_role_changes = print_staging_role_changes(&layout); let has_role_changes = print_staging_role_changes(&layout);
let has_param_changes = print_staging_parameters_changes(&layout); if has_role_changes {
if has_role_changes || has_param_changes {
let v = layout.version; let v = layout.version;
let res_apply = layout.apply_staged_changes(Some(v + 1)); let res_apply = layout.apply_staged_changes(Some(v + 1));
@ -193,9 +189,7 @@ pub async fn cmd_show_layout(
Ok((layout, msg)) => { Ok((layout, msg)) => {
println!(); println!();
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
if !print_cluster_layout(&layout) { print_cluster_layout(&layout, "No nodes have a role in the new layout.");
println!("No nodes have a role in the new layout.");
}
println!(); println!();
for line in msg.iter() { for line in msg.iter() {
@ -267,28 +261,35 @@ pub async fn cmd_config_layout(
let mut did_something = false; let mut did_something = false;
match config_opt.redundancy { match config_opt.redundancy {
None => (), None => (),
Some(r) => { Some(r_str) => {
if r > layout.replication_factor { let r = r_str
println!( .parse::<ZoneRedundancy>()
"The zone redundancy must be smaller or equal to the \ .ok_or_message("invalid zone redundancy value")?;
replication factor ({}).", if let ZoneRedundancy::AtLeast(r_int) = r {
layout.replication_factor if r_int > layout.replication_factor {
); return Err(Error::Message(format!(
} else if r < 1 { "The zone redundancy must be smaller or equal to the \
println!("The zone redundancy must be at least 1."); replication factor ({}).",
} else { layout.replication_factor
layout )));
.staging_parameters } else if r_int < 1 {
.update(LayoutParameters { zone_redundancy: r }); return Err(Error::Message(
println!("The new zone redundancy has been saved ({}).", r); "The zone redundancy must be at least 1.".into(),
));
}
} }
layout
.staging_parameters
.update(LayoutParameters { zone_redundancy: r });
println!("The zone redundancy parameter has been set to '{}'.", r);
did_something = true; did_something = true;
} }
} }
if !did_something { if !did_something {
return Err(Error::Message( return Err(Error::Message(
"Please specify an action for `garage layout config` to do".into(), "Please specify an action for `garage layout config`".into(),
)); ));
} }
@ -326,7 +327,7 @@ pub async fn send_layout(
Ok(()) Ok(())
} }
pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
for (id, _, role) in layout.roles.items().iter() { for (id, _, role) in layout.roles.items().iter() {
let role = match &role.0 { let role = match &role.0 {
@ -356,61 +357,54 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
)); ));
}; };
} }
println!(); if table.len() > 1 {
println!("Parameters of the layout computation:");
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
println!();
if table.len() == 1 {
false
} else {
format_table(table); format_table(table);
true
}
}
pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool {
let has_changes = *layout.staging_parameters.get() != layout.parameters;
if has_changes {
println!();
println!("==== NEW LAYOUT PARAMETERS ====");
println!(
"Zone redundancy: {}",
layout.staging_parameters.get().zone_redundancy
);
println!(); println!();
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
} else {
println!("{}", empty_msg);
} }
has_changes
} }
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
let has_changes = layout let has_role_changes = layout
.staging_roles .staging_roles
.items() .items()
.iter() .iter()
.any(|(k, _, v)| layout.roles.get(k) != Some(v)); .any(|(k, _, v)| layout.roles.get(k) != Some(v));
let has_layout_changes = *layout.staging_parameters.get() != layout.parameters;
if has_changes { if has_role_changes || has_layout_changes {
println!(); println!();
println!("==== STAGED ROLE CHANGES ===="); println!("==== STAGED ROLE CHANGES ====");
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; if has_role_changes {
for (id, _, role) in layout.staging_roles.items().iter() { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
if layout.roles.get(id) == Some(role) { for (id, _, role) in layout.staging_roles.items().iter() {
continue; if layout.roles.get(id) == Some(role) {
} continue;
if let Some(role) = &role.0 { }
let tags = role.tags.join(","); if let Some(role) = &role.0 {
table.push(format!( let tags = role.tags.join(",");
"{:?}\t{}\t{}\t{}", table.push(format!(
id, "{:?}\t{}\t{}\t{}",
tags, id,
role.zone, tags,
role.capacity_string() role.zone,
)); role.capacity_string()
} else { ));
table.push(format!("{:?}\tREMOVED", id)); } else {
table.push(format!("{:?}\tREMOVED", id));
}
} }
format_table(table);
println!();
}
if has_layout_changes {
println!(
"Zone redundancy: {}",
layout.staging_parameters.get().zone_redundancy
);
} }
format_table(table);
true true
} else { } else {
false false

View file

@ -143,9 +143,9 @@ pub struct RemoveRoleOpt {
#[derive(StructOpt, Debug)] #[derive(StructOpt, Debug)]
pub struct ConfigLayoutOpt { pub struct ConfigLayoutOpt {
/// Zone redundancy parameter /// Zone redundancy parameter ('none'/'max' or integer)
#[structopt(short = "r", long = "redundancy")] #[structopt(short = "r", long = "redundancy")]
pub(crate) redundancy: Option<usize>, pub(crate) redundancy: Option<String>,
} }
#[derive(StructOpt, Debug)] #[derive(StructOpt, Debug)]

View file

@ -14,6 +14,7 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
format_table.workspace = true
garage_db.workspace = true garage_db.workspace = true
garage_util.workspace = true garage_util.workspace = true

View file

@ -1,7 +1,7 @@
//! This module deals with graph algorithms. //! This module deals with graph algorithms.
//! It is used in layout.rs to build the partition to node assignment. //! It is used in layout.rs to build the partition to node assignment.
use rand::prelude::SliceRandom; use rand::prelude::{SeedableRng, SliceRandom};
use std::cmp::{max, min}; use std::cmp::{max, min};
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::VecDeque; use std::collections::VecDeque;
@ -143,7 +143,11 @@ impl Graph<FlowEdge> {
/// This function shuffles the order of the edge lists. It keeps the ids of the /// This function shuffles the order of the edge lists. It keeps the ids of the
/// reversed edges consistent. /// reversed edges consistent.
fn shuffle_edges(&mut self) { fn shuffle_edges(&mut self) {
let mut rng = rand::thread_rng(); // We use deterministic randomness so that the layout calculation algorithm
// will output the same thing every time it is run. This way, the results
// pre-calculated in `garage layout show` will match exactly those used
// in practice with `garage layout apply`
let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]);
for i in 0..self.graph.len() { for i in 0..self.graph.len() {
self.graph[i].shuffle(&mut rng); self.graph[i].shuffle(&mut rng);
// We need to update the ids of the reverse edges. // We need to update the ids of the reverse edges.
@ -189,7 +193,7 @@ impl Graph<FlowEdge> {
let mut fifo = VecDeque::new(); let mut fifo = VecDeque::new();
fifo.push_back((idsource, 0)); fifo.push_back((idsource, 0));
while let Some((id, lvl)) = fifo.pop_front() { while let Some((id, lvl)) = fifo.pop_front() {
if level[id] == None { if level[id].is_none() {
// it means id has not yet been reached // it means id has not yet been reached
level[id] = Some(lvl); level[id] = Some(lvl);
for edge in self.graph[id].iter() { for edge in self.graph[id].iter() {
@ -199,7 +203,7 @@ impl Graph<FlowEdge> {
} }
} }
} }
if level[idsink] == None { if level[idsink].is_none() {
// There is no residual flow // There is no residual flow
break; break;
} }
@ -383,7 +387,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
for t in 0..forest.len() { for t in 0..forest.len() {
let mut id = t; let mut id = t;
// while we are on a valid undiscovered node // while we are on a valid undiscovered node
while time_of_discovery[id] == None { while time_of_discovery[id].is_none() {
time_of_discovery[id] = Some(t); time_of_discovery[id] = Some(t);
if let Some(i) = forest[id] { if let Some(i) = forest[id] {
id = i; id = i;
@ -391,7 +395,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
break; break;
} }
} }
if forest[id] != None && time_of_discovery[id] == Some(t) { if forest[id].is_some() && time_of_discovery[id] == Some(t) {
// We discovered an id that we explored at this iteration t. // We discovered an id that we explored at this iteration t.
// It means we are on a cycle // It means we are on a cycle
let mut cy = vec![id; 1]; let mut cy = vec![id; 1];

View file

@ -1,6 +1,7 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt;
use bytesize::ByteSize; use bytesize::ByteSize;
use itertools::Itertools; use itertools::Itertools;
@ -115,7 +116,16 @@ mod v09 {
/// algorithm. It is stored as a Crdt. /// algorithm. It is stored as a Crdt.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct LayoutParameters { pub struct LayoutParameters {
pub zone_redundancy: usize, pub zone_redundancy: ZoneRedundancy,
}
/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
/// of each partition on at least that number of different zones.
/// Otherwise, copies will be stored on the maximum possible number of zones.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub enum ZoneRedundancy {
AtLeast(usize),
Maximum,
} }
impl garage_util::migrate::Migrate for ClusterLayout { impl garage_util::migrate::Migrate for ClusterLayout {
@ -125,7 +135,6 @@ mod v09 {
fn migrate(previous: Self::Previous) -> Self { fn migrate(previous: Self::Previous) -> Self {
use itertools::Itertools; use itertools::Itertools;
use std::collections::HashSet;
// In the old layout, capacities are in an arbitrary unit, // In the old layout, capacities are in an arbitrary unit,
// but in the new layout they are in bytes. // but in the new layout they are in bytes.
@ -152,17 +161,10 @@ mod v09 {
.min() .min()
.unwrap_or(0); .unwrap_or(0);
// Determine zone redundancy parameter // By default, zone_redundancy is maximum possible value
let zone_redundancy = std::cmp::min( let parameters = LayoutParameters {
previous.replication_factor, zone_redundancy: ZoneRedundancy::Maximum,
roles };
.items()
.iter()
.filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str()))
.collect::<HashSet<&str>>()
.len(),
);
let parameters = LayoutParameters { zone_redundancy };
let mut res = Self { let mut res = Self {
version: previous.version, version: previous.version,
@ -193,7 +195,7 @@ mod v09 {
.. ..
})) = role })) = role
{ {
*cap = *cap * mul; *cap *= mul;
} }
new_roles.merge_raw(node, *ts, &role); new_roles.merge_raw(node, *ts, &role);
} }
@ -224,15 +226,39 @@ impl NodeRole {
} }
} }
impl fmt::Display for ZoneRedundancy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ZoneRedundancy::Maximum => write!(f, "maximum"),
ZoneRedundancy::AtLeast(x) => write!(f, "{}", x),
}
}
}
impl core::str::FromStr for ZoneRedundancy {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum),
x => {
let v = x
.parse::<usize>()
.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?;
Ok(ZoneRedundancy::AtLeast(v))
}
}
}
}
// Implementation of the ClusterLayout methods unrelated to the assignment algorithm. // Implementation of the ClusterLayout methods unrelated to the assignment algorithm.
impl ClusterLayout { impl ClusterLayout {
pub fn new(replication_factor: usize) -> Self { pub fn new(replication_factor: usize) -> Self {
// We set the default zone redundancy to be equal to the replication factor, // We set the default zone redundancy to be Maximum, meaning that the maximum
// i.e. as strict as possible. // possible value will be used depending on the cluster topology
let parameters = LayoutParameters { let parameters = LayoutParameters {
zone_redundancy: replication_factor, zone_redundancy: ZoneRedundancy::Maximum,
}; };
let staging_parameters = Lww::<LayoutParameters>::new(parameters.clone()); let staging_parameters = Lww::<LayoutParameters>::new(parameters);
let empty_lwwmap = LwwMap::new(); let empty_lwwmap = LwwMap::new();
@ -296,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
self.roles.merge(&self.staging_roles); self.roles.merge(&self.staging_roles);
self.roles.retain(|(_, _, v)| v.0.is_some()); self.roles.retain(|(_, _, v)| v.0.is_some());
self.parameters = self.staging_parameters.get().clone(); self.parameters = *self.staging_parameters.get();
self.staging_roles.clear(); self.staging_roles.clear();
self.staging_hash = self.calculate_staging_hash(); self.staging_hash = self.calculate_staging_hash();
@ -325,7 +351,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
} }
self.staging_roles.clear(); self.staging_roles.clear();
self.staging_parameters.update(self.parameters.clone()); self.staging_parameters.update(self.parameters);
self.staging_hash = self.calculate_staging_hash(); self.staging_hash = self.calculate_staging_hash();
self.version += 1; self.version += 1;
@ -356,7 +382,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
let mut result = Vec::<Uuid>::new(); let mut result = Vec::<Uuid>::new();
for uuid in self.node_id_vec.iter() { for uuid in self.node_id_vec.iter() {
match self.node_role(uuid) { match self.node_role(uuid) {
Some(role) if role.capacity != None => result.push(*uuid), Some(role) if role.capacity.is_some() => result.push(*uuid),
_ => (), _ => (),
} }
} }
@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show`
Ok(total_capacity) Ok(total_capacity)
} }
/// Returns the effective value of the zone_redundancy parameter
fn effective_zone_redundancy(&self) -> usize {
match self.parameters.zone_redundancy {
ZoneRedundancy::AtLeast(v) => v,
ZoneRedundancy::Maximum => {
let n_zones = self
.roles
.items()
.iter()
.filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str()))
.collect::<HashSet<&str>>()
.len();
std::cmp::min(n_zones, self.replication_factor)
}
}
}
/// Check a cluster layout for internal consistency /// Check a cluster layout for internal consistency
/// (assignment, roles, parameters, partition size) /// (assignment, roles, parameters, partition size)
/// returns true if consistent, false if error /// returns true if consistent, false if error
@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
} }
// Check that every partition is associated to distinct nodes // Check that every partition is associated to distinct nodes
let zone_redundancy = self.effective_zone_redundancy();
let rf = self.replication_factor; let rf = self.replication_factor;
for p in 0..(1 << PARTITION_BITS) { for p in 0..(1 << PARTITION_BITS) {
let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec();
@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show`
.expect("Zone not found.") .expect("Zone not found.")
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let redundancy = self.parameters.zone_redundancy; if zones_of_p.iter().unique().count() < zone_redundancy {
if zones_of_p.iter().unique().count() < redundancy {
return Err(format!( return Err(format!(
"nodes of partition are in less than {} distinct zones", "nodes of partition are in less than {} distinct zones",
redundancy zone_redundancy
)); ));
} }
} }
@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
// algorithm. // algorithm.
let cl2 = self.clone(); let cl2 = self.clone();
let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap();
match cl2.compute_optimal_partition_size(&zone_to_id) { match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) {
Ok(s) if s != self.partition_size => { Ok(s) if s != self.partition_size => {
return Err(format!( return Err(format!(
"partition_size ({}) is different than optimal value ({})", "partition_size ({}) is different than optimal value ({})",
@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show`
} }
} }
// ====================================================================================
// Implementation of the ClusterLayout methods related to the assignment algorithm. // Implementation of the ClusterLayout methods related to the assignment algorithm.
impl ClusterLayout { impl ClusterLayout {
/// This function calculates a new partition-to-node assignment. /// This function calculates a new partition-to-node assignment.
@ -549,13 +594,15 @@ impl ClusterLayout {
// changes in the layout. We retrieve the old_assignment reframed with new ids // changes in the layout. We retrieve the old_assignment reframed with new ids
let old_assignment_opt = self.update_node_id_vec()?; let old_assignment_opt = self.update_node_id_vec()?;
let zone_redundancy = self.effective_zone_redundancy();
let mut msg = Message::new(); let mut msg = Message::new();
msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
msg.push("".into()); msg.push("".into());
msg.push(format!( msg.push(format!(
"Partitions are \ "Partitions are \
replicated {} times on at least {} distinct zones.", replicated {} times on at least {} distinct zones.",
self.replication_factor, self.parameters.zone_redundancy self.replication_factor, zone_redundancy
)); ));
// We generate for once numerical ids for the zones of non gateway nodes, // We generate for once numerical ids for the zones of non gateway nodes,
@ -570,12 +617,12 @@ impl ClusterLayout {
nb_nongateway_nodes, self.replication_factor nb_nongateway_nodes, self.replication_factor
))); )));
} }
if id_to_zone.len() < self.parameters.zone_redundancy { if id_to_zone.len() < zone_redundancy {
return Err(Error::Message(format!( return Err(Error::Message(format!(
"The number of zones with non-gateway \ "The number of zones with non-gateway \
nodes ({}) is smaller than the redundancy parameter ({})", nodes ({}) is smaller than the redundancy parameter ({})",
id_to_zone.len(), id_to_zone.len(),
self.parameters.zone_redundancy zone_redundancy
))); )));
} }
@ -583,18 +630,18 @@ impl ClusterLayout {
// Capacities should be given in a unit so that partition size is at least 100. // Capacities should be given in a unit so that partition size is at least 100.
// In this case, integer rounding plays a marginal role in the percentages of // In this case, integer rounding plays a marginal role in the percentages of
// optimality. // optimality.
let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?;
if old_assignment_opt != None { msg.push("".into());
if old_assignment_opt.is_some() {
msg.push(format!( msg.push(format!(
"Optimal size of a partition: {} (was {} in the previous layout).", "Optimal partition size: {} ({} in previous layout)",
ByteSize::b(partition_size).to_string_as(false), ByteSize::b(partition_size).to_string_as(false),
ByteSize::b(self.partition_size).to_string_as(false) ByteSize::b(self.partition_size).to_string_as(false)
)); ));
} else { } else {
msg.push(format!( msg.push(format!(
"Given the replication and redundancy constraints, the \ "Optimal partition size: {}",
optimal size of a partition is {}.",
ByteSize::b(partition_size).to_string_as(false) ByteSize::b(partition_size).to_string_as(false)
)); ));
} }
@ -610,7 +657,8 @@ impl ClusterLayout {
// We compute a first flow/assignment that is heuristically close to the previous // We compute a first flow/assignment that is heuristically close to the previous
// assignment // assignment
let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?; let mut gflow =
self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?;
if let Some(assoc) = &old_assignment_opt { if let Some(assoc) = &old_assignment_opt {
// We minimize the distance to the previous assignment. // We minimize the distance to the previous assignment.
self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?;
@ -618,7 +666,6 @@ impl ClusterLayout {
// We display statistics of the computation // We display statistics of the computation
msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?);
msg.push("".to_string());
// We update the layout structure // We update the layout structure
self.update_ring_from_flow(id_to_zone.len(), &gflow)?; self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
@ -645,7 +692,7 @@ impl ClusterLayout {
.roles .roles
.items() .items()
.iter() .iter()
.filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some()))
.map(|(k, _, _)| *k) .map(|(k, _, _)| *k)
.collect(); .collect();
@ -661,7 +708,7 @@ impl ClusterLayout {
.roles .roles
.items() .items()
.iter() .iter()
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none()))
.map(|(k, _, _)| *k) .map(|(k, _, _)| *k)
.collect(); .collect();
@ -723,7 +770,7 @@ impl ClusterLayout {
for uuid in self.nongateway_nodes().iter() { for uuid in self.nongateway_nodes().iter() {
let r = self.node_role(uuid).unwrap(); let r = self.node_role(uuid).unwrap();
if !zone_to_id.contains_key(&r.zone) && r.capacity != None { if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() {
zone_to_id.insert(r.zone.clone(), id_to_zone.len()); zone_to_id.insert(r.zone.clone(), id_to_zone.len());
id_to_zone.push(r.zone.clone()); id_to_zone.push(r.zone.clone());
} }
@ -736,9 +783,10 @@ impl ClusterLayout {
fn compute_optimal_partition_size( fn compute_optimal_partition_size(
&self, &self,
zone_to_id: &HashMap<String, usize>, zone_to_id: &HashMap<String, usize>,
zone_redundancy: usize,
) -> Result<u64, Error> { ) -> Result<u64, Error> {
let empty_set = HashSet::<(usize, usize)>::new(); let empty_set = HashSet::<(usize, usize)>::new();
let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?;
g.compute_maximal_flow()?; g.compute_maximal_flow()?;
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
return Err(Error::Message( return Err(Error::Message(
@ -751,7 +799,12 @@ impl ClusterLayout {
let mut s_down = 1; let mut s_down = 1;
let mut s_up = self.get_total_capacity()?; let mut s_up = self.get_total_capacity()?;
while s_down + 1 < s_up { while s_down + 1 < s_up {
g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; g = self.generate_flow_graph(
(s_down + s_up) / 2,
zone_to_id,
&empty_set,
zone_redundancy,
)?;
g.compute_maximal_flow()?; g.compute_maximal_flow()?;
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
s_up = (s_down + s_up) / 2; s_up = (s_down + s_up) / 2;
@ -790,18 +843,18 @@ impl ClusterLayout {
partition_size: u64, partition_size: u64,
zone_to_id: &HashMap<String, usize>, zone_to_id: &HashMap<String, usize>,
exclude_assoc: &HashSet<(usize, usize)>, exclude_assoc: &HashSet<(usize, usize)>,
zone_redundancy: usize,
) -> Result<Graph<FlowEdge>, Error> { ) -> Result<Graph<FlowEdge>, Error> {
let vertices = let vertices =
ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
let mut g = Graph::<FlowEdge>::new(&vertices); let mut g = Graph::<FlowEdge>::new(&vertices);
let nb_zones = zone_to_id.len(); let nb_zones = zone_to_id.len();
let redundancy = self.parameters.zone_redundancy;
for p in 0..NB_PARTITIONS { for p in 0..NB_PARTITIONS {
g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?;
g.add_edge( g.add_edge(
Vertex::Source, Vertex::Source,
Vertex::Pdown(p), Vertex::Pdown(p),
(self.replication_factor - redundancy) as u64, (self.replication_factor - zone_redundancy) as u64,
)?; )?;
for z in 0..nb_zones { for z in 0..nb_zones {
g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?;
@ -830,6 +883,7 @@ impl ClusterLayout {
&self, &self,
zone_to_id: &HashMap<String, usize>, zone_to_id: &HashMap<String, usize>,
prev_assign_opt: &Option<Vec<Vec<usize>>>, prev_assign_opt: &Option<Vec<Vec<usize>>>,
zone_redundancy: usize,
) -> Result<Graph<FlowEdge>, Error> { ) -> Result<Graph<FlowEdge>, Error> {
// We list the (partition,node) associations that are not used in the // We list the (partition,node) associations that are not used in the
// previous assignment // previous assignment
@ -847,7 +901,12 @@ impl ClusterLayout {
} }
// We compute the best flow using only the edges used in the previous assignment // We compute the best flow using only the edges used in the previous assignment
let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; let mut g = self.generate_flow_graph(
self.partition_size,
zone_to_id,
&exclude_edge,
zone_redundancy,
)?;
g.compute_maximal_flow()?; g.compute_maximal_flow()?;
// We add the excluded edges and compute the maximal flow with the full graph. // We add the excluded edges and compute the maximal flow with the full graph.
@ -931,29 +990,33 @@ impl ClusterLayout {
let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
let total_cap = self.get_total_capacity()?; let total_cap = self.get_total_capacity()?;
let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
msg.push("".into());
msg.push(format!( msg.push(format!(
"Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", "Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
ByteSize::b(used_cap).to_string_as(false), ByteSize::b(used_cap).to_string_as(false),
ByteSize::b(total_cap).to_string_as(false), ByteSize::b(total_cap).to_string_as(false),
percent_cap percent_cap
)); ));
msg.push("".into());
msg.push(
"If the percentage is too low, it might be that the \
replication/redundancy constraints force the use of nodes/zones with small \
storage capacities. \
You might want to rebalance the storage capacities or relax the constraints. \
See the detailed statistics below and look for saturated nodes/zones."
.into(),
);
msg.push(format!( msg.push(format!(
"Recall that because of the replication factor, the actual available \ "Effective capacity (replication factor {}): {}",
storage capacity is {} / {} = {}.",
ByteSize::b(used_cap).to_string_as(false),
self.replication_factor, self.replication_factor,
ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false)
)); ));
if percent_cap < 80. {
msg.push("".into());
msg.push(
"If the percentage is too low, it might be that the \
cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
storage capacities."
.into(),
);
msg.push(
"You might want to move storage capacity between zones or relax the redundancy constraint."
.into(),
);
msg.push(
"See the detailed statistics below and look for saturated nodes/zones.".into(),
);
}
// We define and fill in the following tables // We define and fill in the following tables
let storing_nodes = self.nongateway_nodes(); let storing_nodes = self.nongateway_nodes();
@ -992,25 +1055,25 @@ impl ClusterLayout {
} }
} }
if *prev_assign_opt == None { if prev_assign_opt.is_none() {
new_partitions = stored_partitions.clone(); new_partitions = stored_partitions.clone();
new_partitions_zone = stored_partitions_zone.clone(); //new_partitions_zone = stored_partitions_zone.clone();
} }
// We display the statistics // We display the statistics
msg.push("".into()); msg.push("".into());
if *prev_assign_opt != None { if prev_assign_opt.is_some() {
let total_new_partitions: usize = new_partitions.iter().sum(); let total_new_partitions: usize = new_partitions.iter().sum();
msg.push(format!( msg.push(format!(
"A total of {} new copies of partitions need to be \ "A total of {} new copies of partitions need to be \
transferred.", transferred.",
total_new_partitions total_new_partitions
)); ));
msg.push("".into());
} }
msg.push("".into());
msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into());
let mut table = vec![];
for z in 0..id_to_zone.len() { for z in 0..id_to_zone.len() {
let mut nodes_of_z = Vec::<usize>::new(); let mut nodes_of_z = Vec::<usize>::new();
for n in 0..storing_nodes.len() { for n in 0..storing_nodes.len() {
@ -1020,15 +1083,9 @@ impl ClusterLayout {
} }
let replicated_partitions: usize = let replicated_partitions: usize =
nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
msg.push("".into()); table.push(format!(
"{}\tTags\tPartitions\tCapacity\tUsable capacity",
msg.push(format!( id_to_zone[z]
"Zone {}: {} distinct partitions stored ({} new, \
{} partition copies) ",
id_to_zone[z],
stored_partitions_zone[z],
new_partitions_zone[z],
replicated_partitions
)); ));
let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
@ -1037,33 +1094,35 @@ impl ClusterLayout {
total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?;
} }
let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
msg.push(format!(
" Usable capacity / Total capacity: {} / {} ({:.1}%).",
ByteSize::b(available_cap_z).to_string_as(false),
ByteSize::b(total_cap_z).to_string_as(false),
percent_cap_z
));
for n in nodes_of_z.iter() { for n in nodes_of_z.iter() {
let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?;
let tags_n = (self let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string();
.node_role(&self.node_id_vec[*n]) table.push(format!(
.ok_or("Node not found."))? " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)",
.tags_string();
msg.push(format!(
" Node {:?}: {} partitions ({} new) ; \
usable/total capacity: {} / {} ({:.1}%) ; tags:{}",
self.node_id_vec[*n], self.node_id_vec[*n],
tags_n,
stored_partitions[*n], stored_partitions[*n],
new_partitions[*n], new_partitions[*n],
ByteSize::b(available_cap_n).to_string_as(false),
ByteSize::b(total_cap_n).to_string_as(false), ByteSize::b(total_cap_n).to_string_as(false),
ByteSize::b(available_cap_n).to_string_as(false),
(available_cap_n as f32) / (total_cap_n as f32) * 100.0, (available_cap_n as f32) / (total_cap_n as f32) * 100.0,
tags_n
)); ));
} }
table.push(format!(
" TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)",
replicated_partitions,
stored_partitions_zone[z],
//new_partitions_zone[z],
ByteSize::b(total_cap_z).to_string_as(false),
ByteSize::b(available_cap_z).to_string_as(false),
percent_cap_z
));
table.push("".into());
} }
msg.push(format_table::format_table_to_string(table));
Ok(msg) Ok(msg)
} }
@ -1125,7 +1184,7 @@ mod tests {
let mut curr_zone = 0; let mut curr_zone = 0;
let redundancy = cl.parameters.zone_redundancy; let redundancy = cl.effective_zone_redundancy();
for replic in 0..cl.replication_factor { for replic in 0..cl.replication_factor {
for p in 0..NB_PARTITIONS { for p in 0..NB_PARTITIONS {
@ -1177,8 +1236,9 @@ mod tests {
); );
cl.staging_roles.merge(&update); cl.staging_roles.merge(&update);
} }
cl.staging_parameters cl.staging_parameters.update(LayoutParameters {
.update(LayoutParameters { zone_redundancy }); zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
});
cl.staging_hash = cl.calculate_staging_hash(); cl.staging_hash = cl.calculate_staging_hash();
} }

View file

@ -668,7 +668,7 @@ impl System {
let prev_layout_check = layout.check().is_ok(); let prev_layout_check = layout.check().is_ok();
if layout.merge(adv) { if layout.merge(adv) {
if prev_layout_check && !layout.check().is_ok() { if prev_layout_check && layout.check().is_err() {
error!("New cluster layout is invalid, discarding."); error!("New cluster layout is invalid, discarding.");
return Err(Error::Message( return Err(Error::Message(
"New cluster layout is invalid, discarding.".into(), "New cluster layout is invalid, discarding.".into(),
@ -724,7 +724,7 @@ impl System {
async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) { async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
while !*stop_signal.borrow() { while !*stop_signal.borrow() {
let not_configured = !self.ring.borrow().layout.check().is_ok(); let not_configured = self.ring.borrow().layout.check().is_err();
let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
let expected_n_nodes = self.ring.borrow().layout.num_nodes(); let expected_n_nodes = self.ring.borrow().layout.num_nodes();
let bad_peers = self let bad_peers = self