Garage v0.9 #473

Merged
lx merged 175 commits from next into main 2023-10-10 13:28:29 +00:00
9 changed files with 438 additions and 180 deletions
Showing only changes of commit aa7eadc799 - Show all commits

1
Cargo.lock generated
View file

@ -1370,6 +1370,7 @@ dependencies = [
"bytes",
"bytesize",
"err-derive",
"format_table",
"futures",
"futures-util",
"garage_db",

View file

@ -7,13 +7,13 @@ args@{
"garage_db/default"
"garage_util/default"
"garage_rpc/default"
"format_table/default"
"garage_table/default"
"garage_block/default"
"garage_model/default"
"garage_api/default"
"garage_web/default"
"garage/default"
"format_table/default"
"k2v-client/default"
],
rustPackages,
@ -33,7 +33,7 @@ args@{
ignoreLockHash,
}:
let
nixifiedLockHash = "7bef0004fa84feec502c75d50632d54202c272d56d2549fc09e2a356141685bb";
nixifiedLockHash = "5df33eefe787762bf831e92c723c153faf8d5910332dcdf2fd941fe03be59936";
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
lockHashIgnored = if ignoreLockHash
@ -60,13 +60,13 @@ in
garage_db = rustPackages.unknown.garage_db."0.8.4";
garage_util = rustPackages.unknown.garage_util."0.8.4";
garage_rpc = rustPackages.unknown.garage_rpc."0.8.4";
format_table = rustPackages.unknown.format_table."0.1.1";
garage_table = rustPackages.unknown.garage_table."0.8.4";
garage_block = rustPackages.unknown.garage_block."0.8.4";
garage_model = rustPackages.unknown.garage_model."0.8.4";
garage_api = rustPackages.unknown.garage_api."0.8.4";
garage_web = rustPackages.unknown.garage_web."0.8.4";
garage = rustPackages.unknown.garage."0.8.4";
format_table = rustPackages.unknown.format_table."0.1.1";
k2v-client = rustPackages.unknown.k2v-client."0.0.4";
};
"registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec {
@ -1954,6 +1954,7 @@ in
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out;
bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.8.4" { inherit profileName; }).out;

View file

@ -9,18 +9,30 @@ a certain capacity, or a gateway node that does not store data and is only
used as an API entry point for faster cluster access.
An introduction to building cluster layouts can be found in the [production deployment](@/documentation/cookbook/real-world.md) page.
In Garage, all of the data that can be stored in a given cluster is divided
into slices which we call *partitions*. Each partition is stored by
one or several nodes in the cluster
(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication-mode)).
The layout determines the correspondence between these partition,
which exist on a logical level, and actual storage nodes.
## How cluster layouts work in Garage
In Garage, a cluster layout is composed of the following components:
A cluster layout is composed of the following components:
- a table of roles assigned to nodes
- a table of roles assigned to nodes, defined by the user
- an optimal assignation of partitions to nodes, computed by an algorithm that is ran once when calling `garage layout apply` or the ApplyClusterLayout API endpoint
- a version number
Garage nodes will always use the cluster layout with the highest version number.
Garage nodes also maintain and synchronize between them a set of proposed role
changes that haven't yet been applied. These changes will be applied (or
canceled) in the next version of the layout
canceled) in the next version of the layout.
All operations on the layout can be realized using the `garage` CLI or using the
[administration API endpoint](@/documentation/reference-manual/admin-api.md).
We give here a description of CLI commands, the admin API semantics are very similar.
The following commands insert modifications to the set of proposed role changes
for the next layout version (but they do not create the new layout immediately):
@ -51,7 +63,7 @@ commands will fail otherwise.
## Warnings about Garage cluster layout management
**Warning: never make several calls to `garage layout apply` or `garage layout
**⚠️ Never make several calls to `garage layout apply` or `garage layout
revert` with the same value of the `--version` flag. Doing so can lead to the
creation of several different layouts with the same version number, in which
case your Garage cluster will become inconsistent until fixed.** If a call to
@ -65,13 +77,198 @@ shell, you shouldn't have much issues as long as you run commands one after
the other and take care of checking the output of `garage layout show`
before applying any changes.
If you are using the `garage` CLI to script layout changes, follow the following recommendations:
If you are using the `garage` CLI or the admin API to script layout changes,
follow the following recommendations:
- Make all of your `garage` CLI calls to the same RPC host. Do not use the
`garage` CLI to connect to individual nodes to send them each a piece of the
layout changes you are making, as the changes propagate asynchronously
between nodes and might not all be taken into account at the time when the
new layout is applied.
- If using the CLI, make all of your `garage` CLI calls to the same RPC host.
If using the admin API, make all of your API calls to the same Garage node. Do
not connect to individual nodes to send them each a piece of the layout changes
you are making, as the changes propagate asynchronously between nodes and might
not all be taken into account at the time when the new layout is applied.
- **Only call `garage layout apply` once**, and call it **strictly after** all
of the `layout assign` and `layout remove` commands have returned.
- **Only call `garage layout apply`/ApplyClusterLayout once**, and call it
**strictly after** all of the `layout assign` and `layout remove`
commands/UpdateClusterLayout API calls have returned.
## Understanding unexpected layout calculations
When adding, removing or modifying nodes in a cluster layout, sometimes
unexpected assigntations of partitions to node can occur. These assignations
are in fact normal and logical, given the objectives of the algorihtm. Indeed,
**the layout algorithm prioritizes moving less data between nodes over the fact
of achieving equal distribution of load. It also tries to use all links between
pairs of nodes in equal proportions when moving data.** This section presents
two examples and illustrates how one can control Garage's behavior to obtain
the desired results.
### Example 1
In this example, a cluster is originally composed of 3 nodes in 3 different
zones (data centers). The three nodes are of equal capacity, therefore they
are all fully exploited and all store a copy of all of the data in the cluster.
Then, a fourth node of the same size is added in the datacenter `dc1`.
As illustrated by the following, **Garage will by default not store any data on the new node**:
```
$ garage layout show
==== CURRENT CLUSTER LAYOUT ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
Zone redundancy: maximum
Current cluster layout version: 6
==== STAGED ROLE CHANGES ====
ID Tags Zone Capacity
a11c7cf18af29737 node4 dc1 1000.0 MB
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
a11c7cf18af29737 node4 dc1 1000.0 MB 0 B (0.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
Zone redundancy: maximum
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
Partitions are replicated 3 times on at least 3 distinct zones.
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
Effective capacity (replication factor 3): 1000.0 MB
A total of 0 new copies of partitions need to be transferred.
dc1 Tags Partitions Capacity Usable capacity
b10c110e4e854e5a node1 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
a11c7cf18af29737 node4 0 (0 new) 1000.0 MB 0 B (0.0%)
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
dc2 Tags Partitions Capacity Usable capacity
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
dc3 Tags Partitions Capacity Usable capacity
62b218d848e86a64 node3 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
```
While unexpected, this is logical because of the following facts:
- storing some data on the new node does not help increase the total quantity
of data that can be stored on the cluster, as the two other zones (`dc2` and
`dc3`) still need to store a full copy of everything, and their capacity is
still the same;
- there is therefore no need to move any data on the new node as this would be pointless;
- moving data to the new node has a cost which the algorithm decides to not pay if not necessary.
This distribution of data can however not be what the administrator wanted: if
they added a new node to `dc1`, it might be because the existing node is too
slow, and they wish to divide its load by half. In that case, what they need to
do to force Garage to distribute the data between the two nodes is to attribute
only half of the capacity to each node in `dc1` (in our example, 500M instead of 1G).
In that case, Garage would determine that to be able to store 1G in total, it
would need to store 500M on the old node and 500M on the added one.
### Example 2
The following example is a slightly different scenario, where `dc1` had two
nodes that were used at 50%, and `dc2` and `dc3` each have one node that is
100% used. All node capacities are the same.
Then, a node from `dc1` is moved into `dc3`. One could expect that the roles of
`dc1` and `dc3` would simply be swapped: the remaining node in `dc1` would be
used at 100%, and the two nodes now in `dc3` would be used at 50%. Instead,
this happens:
```
==== CURRENT CLUSTER LAYOUT ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 500.0 MB (50.0%)
a11c7cf18af29737 node4 dc1 1000.0 MB 500.0 MB (50.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
Zone redundancy: maximum
Current cluster layout version: 8
==== STAGED ROLE CHANGES ====
ID Tags Zone Capacity
a11c7cf18af29737 node4 dc3 1000.0 MB
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
ID Tags Zone Capacity Usable capacity
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
62b218d848e86a64 node3 dc3 1000.0 MB 753.9 MB (75.4%)
a11c7cf18af29737 node4 dc3 1000.0 MB 246.1 MB (24.6%)
Zone redundancy: maximum
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
Partitions are replicated 3 times on at least 3 distinct zones.
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
Effective capacity (replication factor 3): 1000.0 MB
A total of 128 new copies of partitions need to be transferred.
dc1 Tags Partitions Capacity Usable capacity
b10c110e4e854e5a node1 256 (128 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
dc2 Tags Partitions Capacity Usable capacity
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
dc3 Tags Partitions Capacity Usable capacity
62b218d848e86a64 node3 193 (0 new) 1000.0 MB 753.9 MB (75.4%)
a11c7cf18af29737 node4 63 (0 new) 1000.0 MB 246.1 MB (24.6%)
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
```
As we can see, the node that was moved to `dc3` (node4) is only used at 25% (approximatively),
whereas the node that was already in `dc3` (node3) is used at 75%.
This can be explained by the following:
- node1 will now be the only node remaining in `dc1`, thus it has to store all
of the data in the cluster. Since it was storing only half of it before, it has
to retrieve the other half from other nodes in the cluster.
- The data which it does not have is entirely stored by the other node that was
in `dc1` and that is now in `dc3` (node4). There is also a copy of it on node2
and node3 since both these nodes have a copy of everything.
- node3 and node4 are the two nodes that will now be in a datacenter that is
under-utilized (`dc3`), this means that those are the two candidates from which
data can be removed to be moved to node1.
- Garage will move data in equal proportions from all possible sources, in this
case it means that it will tranfer 25% of the entire data set from node3 to
node1 and another 25% from node4 to node1.
This explains why node3 ends with 75% utilization (100% from before minus 25%
that is moved to node1), and node4 ends with 25% (50% from before minus 25%
that is moved to node1).
This illustrates the second principle of the layout computation: **if there is
a choice in moving data out of some nodes, then all links between pairs of
nodes are used in equal proportions** (this is approximately true, there is
randomness in the algorihtm to achieve this so there might be some small
fluctuations, as we see above).

View file

@ -174,16 +174,12 @@ pub async fn cmd_show_layout(
let layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== CURRENT CLUSTER LAYOUT ====");
if !print_cluster_layout(&layout) {
println!("No nodes currently have a role in the cluster.");
println!("See `garage status` to view available nodes.");
}
print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
println!();
println!("Current cluster layout version: {}", layout.version);
let has_role_changes = print_staging_role_changes(&layout);
let has_param_changes = print_staging_parameters_changes(&layout);
if has_role_changes || has_param_changes {
if has_role_changes {
let v = layout.version;
let res_apply = layout.apply_staged_changes(Some(v + 1));
@ -193,9 +189,7 @@ pub async fn cmd_show_layout(
Ok((layout, msg)) => {
println!();
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
if !print_cluster_layout(&layout) {
println!("No nodes have a role in the new layout.");
}
print_cluster_layout(&layout, "No nodes have a role in the new layout.");
println!();
for line in msg.iter() {
@ -267,28 +261,35 @@ pub async fn cmd_config_layout(
let mut did_something = false;
match config_opt.redundancy {
None => (),
Some(r) => {
if r > layout.replication_factor {
println!(
"The zone redundancy must be smaller or equal to the \
replication factor ({}).",
layout.replication_factor
);
} else if r < 1 {
println!("The zone redundancy must be at least 1.");
} else {
layout
.staging_parameters
.update(LayoutParameters { zone_redundancy: r });
println!("The new zone redundancy has been saved ({}).", r);
Some(r_str) => {
let r = r_str
.parse::<ZoneRedundancy>()
.ok_or_message("invalid zone redundancy value")?;
if let ZoneRedundancy::AtLeast(r_int) = r {
if r_int > layout.replication_factor {
return Err(Error::Message(format!(
"The zone redundancy must be smaller or equal to the \
replication factor ({}).",
layout.replication_factor
)));
} else if r_int < 1 {
return Err(Error::Message(
"The zone redundancy must be at least 1.".into(),
));
}
}
layout
.staging_parameters
.update(LayoutParameters { zone_redundancy: r });
println!("The zone redundancy parameter has been set to '{}'.", r);
did_something = true;
}
}
if !did_something {
return Err(Error::Message(
"Please specify an action for `garage layout config` to do".into(),
"Please specify an action for `garage layout config`".into(),
));
}
@ -326,7 +327,7 @@ pub async fn send_layout(
Ok(())
}
pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
for (id, _, role) in layout.roles.items().iter() {
let role = match &role.0 {
@ -356,61 +357,54 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
));
};
}
println!();
println!("Parameters of the layout computation:");
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
println!();
if table.len() == 1 {
false
} else {
if table.len() > 1 {
format_table(table);
true
}
}
pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool {
let has_changes = *layout.staging_parameters.get() != layout.parameters;
if has_changes {
println!();
println!("==== NEW LAYOUT PARAMETERS ====");
println!(
"Zone redundancy: {}",
layout.staging_parameters.get().zone_redundancy
);
println!();
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
} else {
println!("{}", empty_msg);
}
has_changes
}
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
let has_changes = layout
let has_role_changes = layout
.staging_roles
.items()
.iter()
.any(|(k, _, v)| layout.roles.get(k) != Some(v));
let has_layout_changes = *layout.staging_parameters.get() != layout.parameters;
if has_changes {
if has_role_changes || has_layout_changes {
println!();
println!("==== STAGED ROLE CHANGES ====");
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
for (id, _, role) in layout.staging_roles.items().iter() {
if layout.roles.get(id) == Some(role) {
continue;
}
if let Some(role) = &role.0 {
let tags = role.tags.join(",");
table.push(format!(
"{:?}\t{}\t{}\t{}",
id,
tags,
role.zone,
role.capacity_string()
));
} else {
table.push(format!("{:?}\tREMOVED", id));
if has_role_changes {
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
for (id, _, role) in layout.staging_roles.items().iter() {
if layout.roles.get(id) == Some(role) {
continue;
}
if let Some(role) = &role.0 {
let tags = role.tags.join(",");
table.push(format!(
"{:?}\t{}\t{}\t{}",
id,
tags,
role.zone,
role.capacity_string()
));
} else {
table.push(format!("{:?}\tREMOVED", id));
}
}
format_table(table);
println!();
}
if has_layout_changes {
println!(
"Zone redundancy: {}",
layout.staging_parameters.get().zone_redundancy
);
}
format_table(table);
true
} else {
false

View file

@ -143,9 +143,9 @@ pub struct RemoveRoleOpt {
#[derive(StructOpt, Debug)]
pub struct ConfigLayoutOpt {
/// Zone redundancy parameter
/// Zone redundancy parameter ('none'/'max' or integer)
#[structopt(short = "r", long = "redundancy")]
pub(crate) redundancy: Option<usize>,
pub(crate) redundancy: Option<String>,
}
#[derive(StructOpt, Debug)]

View file

@ -14,6 +14,7 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
format_table.workspace = true
garage_db.workspace = true
garage_util.workspace = true

View file

@ -1,7 +1,7 @@
//! This module deals with graph algorithms.
//! It is used in layout.rs to build the partition to node assignment.
use rand::prelude::SliceRandom;
use rand::prelude::{SeedableRng, SliceRandom};
use std::cmp::{max, min};
use std::collections::HashMap;
use std::collections::VecDeque;
@ -143,7 +143,11 @@ impl Graph<FlowEdge> {
/// This function shuffles the order of the edge lists. It keeps the ids of the
/// reversed edges consistent.
fn shuffle_edges(&mut self) {
let mut rng = rand::thread_rng();
// We use deterministic randomness so that the layout calculation algorihtm
// will output the same thing every time it is run. This way, the results
// pre-calculated in `garage layout show` will match exactly those used
// in practice with `garage layout apply`
let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]);
for i in 0..self.graph.len() {
self.graph[i].shuffle(&mut rng);
// We need to update the ids of the reverse edges.
@ -189,7 +193,7 @@ impl Graph<FlowEdge> {
let mut fifo = VecDeque::new();
fifo.push_back((idsource, 0));
while let Some((id, lvl)) = fifo.pop_front() {
if level[id] == None {
if level[id].is_none() {
// it means id has not yet been reached
level[id] = Some(lvl);
for edge in self.graph[id].iter() {
@ -199,7 +203,7 @@ impl Graph<FlowEdge> {
}
}
}
if level[idsink] == None {
if level[idsink].is_none() {
// There is no residual flow
break;
}
@ -383,7 +387,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
for t in 0..forest.len() {
let mut id = t;
// while we are on a valid undiscovered node
while time_of_discovery[id] == None {
while time_of_discovery[id].is_none() {
time_of_discovery[id] = Some(t);
if let Some(i) = forest[id] {
id = i;
@ -391,7 +395,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
break;
}
}
if forest[id] != None && time_of_discovery[id] == Some(t) {
if forest[id].is_some() && time_of_discovery[id] == Some(t) {
// We discovered an id that we explored at this iteration t.
// It means we are on a cycle
let mut cy = vec![id; 1];

View file

@ -1,6 +1,7 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
use bytesize::ByteSize;
use itertools::Itertools;
@ -115,7 +116,16 @@ mod v09 {
/// algorithm. It is stored as a Crdt.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct LayoutParameters {
pub zone_redundancy: usize,
pub zone_redundancy: ZoneRedundancy,
}
/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
/// of each partition on at least that number of different zones.
/// Otherwise, copies will be stored on the maximum possible number of zones.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub enum ZoneRedundancy {
AtLeast(usize),
Maximum,
}
impl garage_util::migrate::Migrate for ClusterLayout {
@ -125,7 +135,6 @@ mod v09 {
fn migrate(previous: Self::Previous) -> Self {
use itertools::Itertools;
use std::collections::HashSet;
// In the old layout, capacities are in an arbitrary unit,
// but in the new layout they are in bytes.
@ -152,17 +161,10 @@ mod v09 {
.min()
.unwrap_or(0);
// Determine zone redundancy parameter
let zone_redundancy = std::cmp::min(
previous.replication_factor,
roles
.items()
.iter()
.filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str()))
.collect::<HashSet<&str>>()
.len(),
);
let parameters = LayoutParameters { zone_redundancy };
// By default, zone_redundancy is maximum possible value
let parameters = LayoutParameters {
zone_redundancy: ZoneRedundancy::Maximum,
};
let mut res = Self {
version: previous.version,
@ -193,7 +195,7 @@ mod v09 {
..
})) = role
{
*cap = *cap * mul;
*cap *= mul;
}
new_roles.merge_raw(node, *ts, &role);
}
@ -224,15 +226,39 @@ impl NodeRole {
}
}
impl fmt::Display for ZoneRedundancy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ZoneRedundancy::Maximum => write!(f, "maximum"),
ZoneRedundancy::AtLeast(x) => write!(f, "{}", x),
}
}
}
impl core::str::FromStr for ZoneRedundancy {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum),
x => {
let v = x
.parse::<usize>()
.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?;
Ok(ZoneRedundancy::AtLeast(v))
}
}
}
}
// Implementation of the ClusterLayout methods unrelated to the assignment algorithm.
impl ClusterLayout {
pub fn new(replication_factor: usize) -> Self {
// We set the default zone redundancy to be equal to the replication factor,
// i.e. as strict as possible.
// We set the default zone redundancy to be Maximum, meaning that the maximum
// possible value will be used depending on the cluster topology
let parameters = LayoutParameters {
zone_redundancy: replication_factor,
zone_redundancy: ZoneRedundancy::Maximum,
};
let staging_parameters = Lww::<LayoutParameters>::new(parameters.clone());
let staging_parameters = Lww::<LayoutParameters>::new(parameters);
let empty_lwwmap = LwwMap::new();
@ -296,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
self.roles.merge(&self.staging_roles);
self.roles.retain(|(_, _, v)| v.0.is_some());
self.parameters = self.staging_parameters.get().clone();
self.parameters = *self.staging_parameters.get();
self.staging_roles.clear();
self.staging_hash = self.calculate_staging_hash();
@ -325,7 +351,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
}
self.staging_roles.clear();
self.staging_parameters.update(self.parameters.clone());
self.staging_parameters.update(self.parameters);
self.staging_hash = self.calculate_staging_hash();
self.version += 1;
@ -356,7 +382,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
let mut result = Vec::<Uuid>::new();
for uuid in self.node_id_vec.iter() {
match self.node_role(uuid) {
Some(role) if role.capacity != None => result.push(*uuid),
Some(role) if role.capacity.is_some() => result.push(*uuid),
_ => (),
}
}
@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show`
Ok(total_capacity)
}
/// Returns the effective value of the zone_redundancy parameter
fn effective_zone_redundancy(&self) -> usize {
match self.parameters.zone_redundancy {
ZoneRedundancy::AtLeast(v) => v,
ZoneRedundancy::Maximum => {
let n_zones = self
.roles
.items()
.iter()
.filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str()))
.collect::<HashSet<&str>>()
.len();
std::cmp::min(n_zones, self.replication_factor)
}
}
}
/// Check a cluster layout for internal consistency
/// (assignment, roles, parameters, partition size)
/// returns true if consistent, false if error
@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
}
// Check that every partition is associated to distinct nodes
let zone_redundancy = self.effective_zone_redundancy();
let rf = self.replication_factor;
for p in 0..(1 << PARTITION_BITS) {
let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec();
@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show`
.expect("Zone not found.")
})
.collect::<Vec<_>>();
let redundancy = self.parameters.zone_redundancy;
if zones_of_p.iter().unique().count() < redundancy {
if zones_of_p.iter().unique().count() < zone_redundancy {
return Err(format!(
"nodes of partition are in less than {} distinct zones",
redundancy
zone_redundancy
));
}
}
@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
// algorithm.
let cl2 = self.clone();
let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap();
match cl2.compute_optimal_partition_size(&zone_to_id) {
match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) {
Ok(s) if s != self.partition_size => {
return Err(format!(
"partition_size ({}) is different than optimal value ({})",
@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show`
}
}
// ====================================================================================
// Implementation of the ClusterLayout methods related to the assignment algorithm.
impl ClusterLayout {
/// This function calculates a new partition-to-node assignment.
@ -549,13 +594,15 @@ impl ClusterLayout {
// changes in the layout. We retrieve the old_assignment reframed with new ids
let old_assignment_opt = self.update_node_id_vec()?;
let zone_redundancy = self.effective_zone_redundancy();
let mut msg = Message::new();
msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
msg.push("".into());
msg.push(format!(
"Partitions are \
replicated {} times on at least {} distinct zones.",
self.replication_factor, self.parameters.zone_redundancy
self.replication_factor, zone_redundancy
));
// We generate for once numerical ids for the zones of non gateway nodes,
@ -570,12 +617,12 @@ impl ClusterLayout {
nb_nongateway_nodes, self.replication_factor
)));
}
if id_to_zone.len() < self.parameters.zone_redundancy {
if id_to_zone.len() < zone_redundancy {
return Err(Error::Message(format!(
"The number of zones with non-gateway \
nodes ({}) is smaller than the redundancy parameter ({})",
id_to_zone.len(),
self.parameters.zone_redundancy
zone_redundancy
)));
}
@ -583,18 +630,18 @@ impl ClusterLayout {
// Capacities should be given in a unit so that partition size is at least 100.
// In this case, integer rounding plays a marginal role in the percentages of
// optimality.
let partition_size = self.compute_optimal_partition_size(&zone_to_id)?;
let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?;
if old_assignment_opt != None {
msg.push("".into());
if old_assignment_opt.is_some() {
msg.push(format!(
"Optimal size of a partition: {} (was {} in the previous layout).",
"Optimal partition size: {} ({} in previous layout)",
ByteSize::b(partition_size).to_string_as(false),
ByteSize::b(self.partition_size).to_string_as(false)
));
} else {
msg.push(format!(
"Given the replication and redundancy constraints, the \
optimal size of a partition is {}.",
"Optimal partition size: {}",
ByteSize::b(partition_size).to_string_as(false)
));
}
@ -610,7 +657,8 @@ impl ClusterLayout {
// We compute a first flow/assignment that is heuristically close to the previous
// assignment
let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?;
let mut gflow =
self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?;
if let Some(assoc) = &old_assignment_opt {
// We minimize the distance to the previous assignment.
self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?;
@ -618,7 +666,6 @@ impl ClusterLayout {
// We display statistics of the computation
msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?);
msg.push("".to_string());
// We update the layout structure
self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
@ -645,7 +692,7 @@ impl ClusterLayout {
.roles
.items()
.iter()
.filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None))
.filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some()))
.map(|(k, _, _)| *k)
.collect();
@ -661,7 +708,7 @@ impl ClusterLayout {
.roles
.items()
.iter()
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None))
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none()))
.map(|(k, _, _)| *k)
.collect();
@ -723,7 +770,7 @@ impl ClusterLayout {
for uuid in self.nongateway_nodes().iter() {
let r = self.node_role(uuid).unwrap();
if !zone_to_id.contains_key(&r.zone) && r.capacity != None {
if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() {
zone_to_id.insert(r.zone.clone(), id_to_zone.len());
id_to_zone.push(r.zone.clone());
}
@ -736,9 +783,10 @@ impl ClusterLayout {
fn compute_optimal_partition_size(
&self,
zone_to_id: &HashMap<String, usize>,
zone_redundancy: usize,
) -> Result<u64, Error> {
let empty_set = HashSet::<(usize, usize)>::new();
let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?;
let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?;
g.compute_maximal_flow()?;
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
return Err(Error::Message(
@ -751,7 +799,12 @@ impl ClusterLayout {
let mut s_down = 1;
let mut s_up = self.get_total_capacity()?;
while s_down + 1 < s_up {
g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?;
g = self.generate_flow_graph(
(s_down + s_up) / 2,
zone_to_id,
&empty_set,
zone_redundancy,
)?;
g.compute_maximal_flow()?;
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
s_up = (s_down + s_up) / 2;
@ -790,18 +843,18 @@ impl ClusterLayout {
partition_size: u64,
zone_to_id: &HashMap<String, usize>,
exclude_assoc: &HashSet<(usize, usize)>,
zone_redundancy: usize,
) -> Result<Graph<FlowEdge>, Error> {
let vertices =
ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
let mut g = Graph::<FlowEdge>::new(&vertices);
let nb_zones = zone_to_id.len();
let redundancy = self.parameters.zone_redundancy;
for p in 0..NB_PARTITIONS {
g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?;
g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?;
g.add_edge(
Vertex::Source,
Vertex::Pdown(p),
(self.replication_factor - redundancy) as u64,
(self.replication_factor - zone_redundancy) as u64,
)?;
for z in 0..nb_zones {
g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?;
@ -830,6 +883,7 @@ impl ClusterLayout {
&self,
zone_to_id: &HashMap<String, usize>,
prev_assign_opt: &Option<Vec<Vec<usize>>>,
zone_redundancy: usize,
) -> Result<Graph<FlowEdge>, Error> {
// We list the (partition,node) associations that are not used in the
// previous assignment
@ -847,7 +901,12 @@ impl ClusterLayout {
}
// We compute the best flow using only the edges used in the previous assignment
let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?;
let mut g = self.generate_flow_graph(
self.partition_size,
zone_to_id,
&exclude_edge,
zone_redundancy,
)?;
g.compute_maximal_flow()?;
// We add the excluded edges and compute the maximal flow with the full graph.
@ -931,29 +990,33 @@ impl ClusterLayout {
let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
let total_cap = self.get_total_capacity()?;
let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
msg.push("".into());
msg.push(format!(
"Usable capacity / Total cluster capacity: {} / {} ({:.1} %)",
"Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
ByteSize::b(used_cap).to_string_as(false),
ByteSize::b(total_cap).to_string_as(false),
percent_cap
));
msg.push("".into());
msg.push(
"If the percentage is too low, it might be that the \
replication/redundancy constraints force the use of nodes/zones with small \
storage capacities. \
You might want to rebalance the storage capacities or relax the constraints. \
See the detailed statistics below and look for saturated nodes/zones."
.into(),
);
msg.push(format!(
"Recall that because of the replication factor, the actual available \
storage capacity is {} / {} = {}.",
ByteSize::b(used_cap).to_string_as(false),
"Effective capacity (replication factor {}): {}",
self.replication_factor,
ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false)
));
if percent_cap < 80. {
msg.push("".into());
msg.push(
"If the percentage is too low, it might be that the \
cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
storage capacities."
.into(),
);
msg.push(
"You might want to move storage capacity between zones or relax the redundancy constraint."
.into(),
);
msg.push(
"See the detailed statistics below and look for saturated nodes/zones.".into(),
);
}
// We define and fill in the following tables
let storing_nodes = self.nongateway_nodes();
@ -992,25 +1055,25 @@ impl ClusterLayout {
}
}
if *prev_assign_opt == None {
if prev_assign_opt.is_none() {
new_partitions = stored_partitions.clone();
new_partitions_zone = stored_partitions_zone.clone();
//new_partitions_zone = stored_partitions_zone.clone();
}
// We display the statistics
msg.push("".into());
if *prev_assign_opt != None {
if prev_assign_opt.is_some() {
let total_new_partitions: usize = new_partitions.iter().sum();
msg.push(format!(
"A total of {} new copies of partitions need to be \
transferred.",
total_new_partitions
));
msg.push("".into());
}
msg.push("".into());
msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into());
let mut table = vec![];
for z in 0..id_to_zone.len() {
let mut nodes_of_z = Vec::<usize>::new();
for n in 0..storing_nodes.len() {
@ -1020,15 +1083,9 @@ impl ClusterLayout {
}
let replicated_partitions: usize =
nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
msg.push("".into());
msg.push(format!(
"Zone {}: {} distinct partitions stored ({} new, \
{} partition copies) ",
id_to_zone[z],
stored_partitions_zone[z],
new_partitions_zone[z],
replicated_partitions
table.push(format!(
"{}\tTags\tPartitions\tCapacity\tUsable capacity",
id_to_zone[z]
));
let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
@ -1037,33 +1094,35 @@ impl ClusterLayout {
total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?;
}
let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
msg.push(format!(
" Usable capacity / Total capacity: {} / {} ({:.1}%).",
ByteSize::b(available_cap_z).to_string_as(false),
ByteSize::b(total_cap_z).to_string_as(false),
percent_cap_z
));
for n in nodes_of_z.iter() {
let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?;
let tags_n = (self
.node_role(&self.node_id_vec[*n])
.ok_or("Node not found."))?
.tags_string();
msg.push(format!(
" Node {:?}: {} partitions ({} new) ; \
usable/total capacity: {} / {} ({:.1}%) ; tags:{}",
let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string();
table.push(format!(
" {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)",
self.node_id_vec[*n],
tags_n,
stored_partitions[*n],
new_partitions[*n],
ByteSize::b(available_cap_n).to_string_as(false),
ByteSize::b(total_cap_n).to_string_as(false),
ByteSize::b(available_cap_n).to_string_as(false),
(available_cap_n as f32) / (total_cap_n as f32) * 100.0,
tags_n
));
}
table.push(format!(
" TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)",
replicated_partitions,
stored_partitions_zone[z],
//new_partitions_zone[z],
ByteSize::b(total_cap_z).to_string_as(false),
ByteSize::b(available_cap_z).to_string_as(false),
percent_cap_z
));
table.push("".into());
}
msg.push(format_table::format_table_to_string(table));
Ok(msg)
}
@ -1125,7 +1184,7 @@ mod tests {
let mut curr_zone = 0;
let redundancy = cl.parameters.zone_redundancy;
let redundancy = cl.effective_zone_redundancy();
for replic in 0..cl.replication_factor {
for p in 0..NB_PARTITIONS {
@ -1177,8 +1236,9 @@ mod tests {
);
cl.staging_roles.merge(&update);
}
cl.staging_parameters
.update(LayoutParameters { zone_redundancy });
cl.staging_parameters.update(LayoutParameters {
zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
});
cl.staging_hash = cl.calculate_staging_hash();
}

View file

@ -668,7 +668,7 @@ impl System {
let prev_layout_check = layout.check().is_ok();
if layout.merge(adv) {
if prev_layout_check && !layout.check().is_ok() {
if prev_layout_check && layout.check().is_err() {
error!("New cluster layout is invalid, discarding.");
return Err(Error::Message(
"New cluster layout is invalid, discarding.".into(),
@ -724,7 +724,7 @@ impl System {
async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
while !*stop_signal.borrow() {
let not_configured = !self.ring.borrow().layout.check().is_ok();
let not_configured = self.ring.borrow().layout.check().is_err();
let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
let expected_n_nodes = self.ring.borrow().layout.num_nodes();
let bad_peers = self