Garage v0.9 #473
9 changed files with 438 additions and 180 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1370,6 +1370,7 @@ dependencies = [
|
|||
"bytes",
|
||||
"bytesize",
|
||||
"err-derive",
|
||||
"format_table",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"garage_db",
|
||||
|
|
|
@ -7,13 +7,13 @@ args@{
|
|||
"garage_db/default"
|
||||
"garage_util/default"
|
||||
"garage_rpc/default"
|
||||
"format_table/default"
|
||||
"garage_table/default"
|
||||
"garage_block/default"
|
||||
"garage_model/default"
|
||||
"garage_api/default"
|
||||
"garage_web/default"
|
||||
"garage/default"
|
||||
"format_table/default"
|
||||
"k2v-client/default"
|
||||
],
|
||||
rustPackages,
|
||||
|
@ -33,7 +33,7 @@ args@{
|
|||
ignoreLockHash,
|
||||
}:
|
||||
let
|
||||
nixifiedLockHash = "7bef0004fa84feec502c75d50632d54202c272d56d2549fc09e2a356141685bb";
|
||||
nixifiedLockHash = "5df33eefe787762bf831e92c723c153faf8d5910332dcdf2fd941fe03be59936";
|
||||
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
|
||||
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
|
||||
lockHashIgnored = if ignoreLockHash
|
||||
|
@ -60,13 +60,13 @@ in
|
|||
garage_db = rustPackages.unknown.garage_db."0.8.4";
|
||||
garage_util = rustPackages.unknown.garage_util."0.8.4";
|
||||
garage_rpc = rustPackages.unknown.garage_rpc."0.8.4";
|
||||
format_table = rustPackages.unknown.format_table."0.1.1";
|
||||
garage_table = rustPackages.unknown.garage_table."0.8.4";
|
||||
garage_block = rustPackages.unknown.garage_block."0.8.4";
|
||||
garage_model = rustPackages.unknown.garage_model."0.8.4";
|
||||
garage_api = rustPackages.unknown.garage_api."0.8.4";
|
||||
garage_web = rustPackages.unknown.garage_web."0.8.4";
|
||||
garage = rustPackages.unknown.garage."0.8.4";
|
||||
format_table = rustPackages.unknown.format_table."0.1.1";
|
||||
k2v-client = rustPackages.unknown.k2v-client."0.0.4";
|
||||
};
|
||||
"registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec {
|
||||
|
@ -1954,6 +1954,7 @@ in
|
|||
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out;
|
||||
bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
|
||||
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out;
|
||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."0.8.4" { inherit profileName; }).out;
|
||||
|
|
|
@ -9,18 +9,30 @@ a certain capacity, or a gateway node that does not store data and is only
|
|||
used as an API entry point for faster cluster access.
|
||||
An introduction to building cluster layouts can be found in the [production deployment](@/documentation/cookbook/real-world.md) page.
|
||||
|
||||
In Garage, all of the data that can be stored in a given cluster is divided
|
||||
into slices which we call *partitions*. Each partition is stored by
|
||||
one or several nodes in the cluster
|
||||
(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication-mode)).
|
||||
The layout determines the correspondence between these partition,
|
||||
which exist on a logical level, and actual storage nodes.
|
||||
|
||||
## How cluster layouts work in Garage
|
||||
|
||||
In Garage, a cluster layout is composed of the following components:
|
||||
A cluster layout is composed of the following components:
|
||||
|
||||
- a table of roles assigned to nodes
|
||||
- a table of roles assigned to nodes, defined by the user
|
||||
- an optimal assignation of partitions to nodes, computed by an algorithm that is ran once when calling `garage layout apply` or the ApplyClusterLayout API endpoint
|
||||
- a version number
|
||||
|
||||
Garage nodes will always use the cluster layout with the highest version number.
|
||||
|
||||
Garage nodes also maintain and synchronize between them a set of proposed role
|
||||
changes that haven't yet been applied. These changes will be applied (or
|
||||
canceled) in the next version of the layout
|
||||
canceled) in the next version of the layout.
|
||||
|
||||
All operations on the layout can be realized using the `garage` CLI or using the
|
||||
[administration API endpoint](@/documentation/reference-manual/admin-api.md).
|
||||
We give here a description of CLI commands, the admin API semantics are very similar.
|
||||
|
||||
The following commands insert modifications to the set of proposed role changes
|
||||
for the next layout version (but they do not create the new layout immediately):
|
||||
|
@ -51,7 +63,7 @@ commands will fail otherwise.
|
|||
|
||||
## Warnings about Garage cluster layout management
|
||||
|
||||
**Warning: never make several calls to `garage layout apply` or `garage layout
|
||||
**⚠️ Never make several calls to `garage layout apply` or `garage layout
|
||||
revert` with the same value of the `--version` flag. Doing so can lead to the
|
||||
creation of several different layouts with the same version number, in which
|
||||
case your Garage cluster will become inconsistent until fixed.** If a call to
|
||||
|
@ -65,13 +77,198 @@ shell, you shouldn't have much issues as long as you run commands one after
|
|||
the other and take care of checking the output of `garage layout show`
|
||||
before applying any changes.
|
||||
|
||||
If you are using the `garage` CLI to script layout changes, follow the following recommendations:
|
||||
If you are using the `garage` CLI or the admin API to script layout changes,
|
||||
follow the following recommendations:
|
||||
|
||||
- Make all of your `garage` CLI calls to the same RPC host. Do not use the
|
||||
`garage` CLI to connect to individual nodes to send them each a piece of the
|
||||
layout changes you are making, as the changes propagate asynchronously
|
||||
between nodes and might not all be taken into account at the time when the
|
||||
new layout is applied.
|
||||
- If using the CLI, make all of your `garage` CLI calls to the same RPC host.
|
||||
If using the admin API, make all of your API calls to the same Garage node. Do
|
||||
not connect to individual nodes to send them each a piece of the layout changes
|
||||
you are making, as the changes propagate asynchronously between nodes and might
|
||||
not all be taken into account at the time when the new layout is applied.
|
||||
|
||||
- **Only call `garage layout apply` once**, and call it **strictly after** all
|
||||
of the `layout assign` and `layout remove` commands have returned.
|
||||
- **Only call `garage layout apply`/ApplyClusterLayout once**, and call it
|
||||
**strictly after** all of the `layout assign` and `layout remove`
|
||||
commands/UpdateClusterLayout API calls have returned.
|
||||
|
||||
|
||||
## Understanding unexpected layout calculations
|
||||
|
||||
When adding, removing or modifying nodes in a cluster layout, sometimes
|
||||
unexpected assigntations of partitions to node can occur. These assignations
|
||||
are in fact normal and logical, given the objectives of the algorihtm. Indeed,
|
||||
**the layout algorithm prioritizes moving less data between nodes over the fact
|
||||
of achieving equal distribution of load. It also tries to use all links between
|
||||
pairs of nodes in equal proportions when moving data.** This section presents
|
||||
two examples and illustrates how one can control Garage's behavior to obtain
|
||||
the desired results.
|
||||
|
||||
### Example 1
|
||||
|
||||
In this example, a cluster is originally composed of 3 nodes in 3 different
|
||||
zones (data centers). The three nodes are of equal capacity, therefore they
|
||||
are all fully exploited and all store a copy of all of the data in the cluster.
|
||||
|
||||
Then, a fourth node of the same size is added in the datacenter `dc1`.
|
||||
As illustrated by the following, **Garage will by default not store any data on the new node**:
|
||||
|
||||
```
|
||||
$ garage layout show
|
||||
==== CURRENT CLUSTER LAYOUT ====
|
||||
ID Tags Zone Capacity Usable capacity
|
||||
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
|
||||
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
|
||||
|
||||
Zone redundancy: maximum
|
||||
|
||||
Current cluster layout version: 6
|
||||
|
||||
==== STAGED ROLE CHANGES ====
|
||||
ID Tags Zone Capacity
|
||||
a11c7cf18af29737 node4 dc1 1000.0 MB
|
||||
|
||||
|
||||
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
|
||||
ID Tags Zone Capacity Usable capacity
|
||||
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
|
||||
a11c7cf18af29737 node4 dc1 1000.0 MB 0 B (0.0%)
|
||||
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
|
||||
|
||||
Zone redundancy: maximum
|
||||
|
||||
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
|
||||
|
||||
Partitions are replicated 3 times on at least 3 distinct zones.
|
||||
|
||||
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
|
||||
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
|
||||
Effective capacity (replication factor 3): 1000.0 MB
|
||||
|
||||
A total of 0 new copies of partitions need to be transferred.
|
||||
|
||||
dc1 Tags Partitions Capacity Usable capacity
|
||||
b10c110e4e854e5a node1 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||
a11c7cf18af29737 node4 0 (0 new) 1000.0 MB 0 B (0.0%)
|
||||
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
|
||||
|
||||
dc2 Tags Partitions Capacity Usable capacity
|
||||
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||
|
||||
dc3 Tags Partitions Capacity Usable capacity
|
||||
62b218d848e86a64 node3 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||
```
|
||||
|
||||
While unexpected, this is logical because of the following facts:
|
||||
|
||||
- storing some data on the new node does not help increase the total quantity
|
||||
of data that can be stored on the cluster, as the two other zones (`dc2` and
|
||||
`dc3`) still need to store a full copy of everything, and their capacity is
|
||||
still the same;
|
||||
|
||||
- there is therefore no need to move any data on the new node as this would be pointless;
|
||||
|
||||
- moving data to the new node has a cost which the algorithm decides to not pay if not necessary.
|
||||
|
||||
This distribution of data can however not be what the administrator wanted: if
|
||||
they added a new node to `dc1`, it might be because the existing node is too
|
||||
slow, and they wish to divide its load by half. In that case, what they need to
|
||||
do to force Garage to distribute the data between the two nodes is to attribute
|
||||
only half of the capacity to each node in `dc1` (in our example, 500M instead of 1G).
|
||||
In that case, Garage would determine that to be able to store 1G in total, it
|
||||
would need to store 500M on the old node and 500M on the added one.
|
||||
|
||||
|
||||
### Example 2
|
||||
|
||||
The following example is a slightly different scenario, where `dc1` had two
|
||||
nodes that were used at 50%, and `dc2` and `dc3` each have one node that is
|
||||
100% used. All node capacities are the same.
|
||||
|
||||
Then, a node from `dc1` is moved into `dc3`. One could expect that the roles of
|
||||
`dc1` and `dc3` would simply be swapped: the remaining node in `dc1` would be
|
||||
used at 100%, and the two nodes now in `dc3` would be used at 50%. Instead,
|
||||
this happens:
|
||||
|
||||
```
|
||||
==== CURRENT CLUSTER LAYOUT ====
|
||||
ID Tags Zone Capacity Usable capacity
|
||||
b10c110e4e854e5a node1 dc1 1000.0 MB 500.0 MB (50.0%)
|
||||
a11c7cf18af29737 node4 dc1 1000.0 MB 500.0 MB (50.0%)
|
||||
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
|
||||
|
||||
Zone redundancy: maximum
|
||||
|
||||
Current cluster layout version: 8
|
||||
|
||||
==== STAGED ROLE CHANGES ====
|
||||
ID Tags Zone Capacity
|
||||
a11c7cf18af29737 node4 dc3 1000.0 MB
|
||||
|
||||
|
||||
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
|
||||
ID Tags Zone Capacity Usable capacity
|
||||
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
|
||||
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||
62b218d848e86a64 node3 dc3 1000.0 MB 753.9 MB (75.4%)
|
||||
a11c7cf18af29737 node4 dc3 1000.0 MB 246.1 MB (24.6%)
|
||||
|
||||
Zone redundancy: maximum
|
||||
|
||||
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
|
||||
|
||||
Partitions are replicated 3 times on at least 3 distinct zones.
|
||||
|
||||
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
|
||||
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
|
||||
Effective capacity (replication factor 3): 1000.0 MB
|
||||
|
||||
A total of 128 new copies of partitions need to be transferred.
|
||||
|
||||
dc1 Tags Partitions Capacity Usable capacity
|
||||
b10c110e4e854e5a node1 256 (128 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||
|
||||
dc2 Tags Partitions Capacity Usable capacity
|
||||
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||
|
||||
dc3 Tags Partitions Capacity Usable capacity
|
||||
62b218d848e86a64 node3 193 (0 new) 1000.0 MB 753.9 MB (75.4%)
|
||||
a11c7cf18af29737 node4 63 (0 new) 1000.0 MB 246.1 MB (24.6%)
|
||||
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
|
||||
```
|
||||
|
||||
As we can see, the node that was moved to `dc3` (node4) is only used at 25% (approximatively),
|
||||
whereas the node that was already in `dc3` (node3) is used at 75%.
|
||||
|
||||
This can be explained by the following:
|
||||
|
||||
- node1 will now be the only node remaining in `dc1`, thus it has to store all
|
||||
of the data in the cluster. Since it was storing only half of it before, it has
|
||||
to retrieve the other half from other nodes in the cluster.
|
||||
|
||||
- The data which it does not have is entirely stored by the other node that was
|
||||
in `dc1` and that is now in `dc3` (node4). There is also a copy of it on node2
|
||||
and node3 since both these nodes have a copy of everything.
|
||||
|
||||
- node3 and node4 are the two nodes that will now be in a datacenter that is
|
||||
under-utilized (`dc3`), this means that those are the two candidates from which
|
||||
data can be removed to be moved to node1.
|
||||
|
||||
- Garage will move data in equal proportions from all possible sources, in this
|
||||
case it means that it will tranfer 25% of the entire data set from node3 to
|
||||
node1 and another 25% from node4 to node1.
|
||||
|
||||
This explains why node3 ends with 75% utilization (100% from before minus 25%
|
||||
that is moved to node1), and node4 ends with 25% (50% from before minus 25%
|
||||
that is moved to node1).
|
||||
|
||||
This illustrates the second principle of the layout computation: **if there is
|
||||
a choice in moving data out of some nodes, then all links between pairs of
|
||||
nodes are used in equal proportions** (this is approximately true, there is
|
||||
randomness in the algorihtm to achieve this so there might be some small
|
||||
fluctuations, as we see above).
|
||||
|
|
|
@ -174,16 +174,12 @@ pub async fn cmd_show_layout(
|
|||
let layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
|
||||
println!("==== CURRENT CLUSTER LAYOUT ====");
|
||||
if !print_cluster_layout(&layout) {
|
||||
println!("No nodes currently have a role in the cluster.");
|
||||
println!("See `garage status` to view available nodes.");
|
||||
}
|
||||
print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
|
||||
println!();
|
||||
println!("Current cluster layout version: {}", layout.version);
|
||||
|
||||
let has_role_changes = print_staging_role_changes(&layout);
|
||||
let has_param_changes = print_staging_parameters_changes(&layout);
|
||||
if has_role_changes || has_param_changes {
|
||||
if has_role_changes {
|
||||
let v = layout.version;
|
||||
let res_apply = layout.apply_staged_changes(Some(v + 1));
|
||||
|
||||
|
@ -193,9 +189,7 @@ pub async fn cmd_show_layout(
|
|||
Ok((layout, msg)) => {
|
||||
println!();
|
||||
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
|
||||
if !print_cluster_layout(&layout) {
|
||||
println!("No nodes have a role in the new layout.");
|
||||
}
|
||||
print_cluster_layout(&layout, "No nodes have a role in the new layout.");
|
||||
println!();
|
||||
|
||||
for line in msg.iter() {
|
||||
|
@ -267,28 +261,35 @@ pub async fn cmd_config_layout(
|
|||
let mut did_something = false;
|
||||
match config_opt.redundancy {
|
||||
None => (),
|
||||
Some(r) => {
|
||||
if r > layout.replication_factor {
|
||||
println!(
|
||||
"The zone redundancy must be smaller or equal to the \
|
||||
replication factor ({}).",
|
||||
layout.replication_factor
|
||||
);
|
||||
} else if r < 1 {
|
||||
println!("The zone redundancy must be at least 1.");
|
||||
} else {
|
||||
layout
|
||||
.staging_parameters
|
||||
.update(LayoutParameters { zone_redundancy: r });
|
||||
println!("The new zone redundancy has been saved ({}).", r);
|
||||
Some(r_str) => {
|
||||
let r = r_str
|
||||
.parse::<ZoneRedundancy>()
|
||||
.ok_or_message("invalid zone redundancy value")?;
|
||||
if let ZoneRedundancy::AtLeast(r_int) = r {
|
||||
if r_int > layout.replication_factor {
|
||||
return Err(Error::Message(format!(
|
||||
"The zone redundancy must be smaller or equal to the \
|
||||
replication factor ({}).",
|
||||
layout.replication_factor
|
||||
)));
|
||||
} else if r_int < 1 {
|
||||
return Err(Error::Message(
|
||||
"The zone redundancy must be at least 1.".into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
layout
|
||||
.staging_parameters
|
||||
.update(LayoutParameters { zone_redundancy: r });
|
||||
println!("The zone redundancy parameter has been set to '{}'.", r);
|
||||
did_something = true;
|
||||
}
|
||||
}
|
||||
|
||||
if !did_something {
|
||||
return Err(Error::Message(
|
||||
"Please specify an action for `garage layout config` to do".into(),
|
||||
"Please specify an action for `garage layout config`".into(),
|
||||
));
|
||||
}
|
||||
|
||||
|
@ -326,7 +327,7 @@ pub async fn send_layout(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
|
||||
pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
|
||||
let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
|
||||
for (id, _, role) in layout.roles.items().iter() {
|
||||
let role = match &role.0 {
|
||||
|
@ -356,61 +357,54 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
|
|||
));
|
||||
};
|
||||
}
|
||||
println!();
|
||||
println!("Parameters of the layout computation:");
|
||||
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
|
||||
println!();
|
||||
if table.len() == 1 {
|
||||
false
|
||||
} else {
|
||||
if table.len() > 1 {
|
||||
format_table(table);
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool {
|
||||
let has_changes = *layout.staging_parameters.get() != layout.parameters;
|
||||
if has_changes {
|
||||
println!();
|
||||
println!("==== NEW LAYOUT PARAMETERS ====");
|
||||
println!(
|
||||
"Zone redundancy: {}",
|
||||
layout.staging_parameters.get().zone_redundancy
|
||||
);
|
||||
println!();
|
||||
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
|
||||
} else {
|
||||
println!("{}", empty_msg);
|
||||
}
|
||||
has_changes
|
||||
}
|
||||
|
||||
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
|
||||
let has_changes = layout
|
||||
let has_role_changes = layout
|
||||
.staging_roles
|
||||
.items()
|
||||
.iter()
|
||||
.any(|(k, _, v)| layout.roles.get(k) != Some(v));
|
||||
let has_layout_changes = *layout.staging_parameters.get() != layout.parameters;
|
||||
|
||||
if has_changes {
|
||||
if has_role_changes || has_layout_changes {
|
||||
println!();
|
||||
println!("==== STAGED ROLE CHANGES ====");
|
||||
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
|
||||
for (id, _, role) in layout.staging_roles.items().iter() {
|
||||
if layout.roles.get(id) == Some(role) {
|
||||
continue;
|
||||
}
|
||||
if let Some(role) = &role.0 {
|
||||
let tags = role.tags.join(",");
|
||||
table.push(format!(
|
||||
"{:?}\t{}\t{}\t{}",
|
||||
id,
|
||||
tags,
|
||||
role.zone,
|
||||
role.capacity_string()
|
||||
));
|
||||
} else {
|
||||
table.push(format!("{:?}\tREMOVED", id));
|
||||
if has_role_changes {
|
||||
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
|
||||
for (id, _, role) in layout.staging_roles.items().iter() {
|
||||
if layout.roles.get(id) == Some(role) {
|
||||
continue;
|
||||
}
|
||||
if let Some(role) = &role.0 {
|
||||
let tags = role.tags.join(",");
|
||||
table.push(format!(
|
||||
"{:?}\t{}\t{}\t{}",
|
||||
id,
|
||||
tags,
|
||||
role.zone,
|
||||
role.capacity_string()
|
||||
));
|
||||
} else {
|
||||
table.push(format!("{:?}\tREMOVED", id));
|
||||
}
|
||||
}
|
||||
format_table(table);
|
||||
println!();
|
||||
}
|
||||
if has_layout_changes {
|
||||
println!(
|
||||
"Zone redundancy: {}",
|
||||
layout.staging_parameters.get().zone_redundancy
|
||||
);
|
||||
}
|
||||
format_table(table);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
|
|
|
@ -143,9 +143,9 @@ pub struct RemoveRoleOpt {
|
|||
|
||||
#[derive(StructOpt, Debug)]
|
||||
pub struct ConfigLayoutOpt {
|
||||
/// Zone redundancy parameter
|
||||
/// Zone redundancy parameter ('none'/'max' or integer)
|
||||
#[structopt(short = "r", long = "redundancy")]
|
||||
pub(crate) redundancy: Option<usize>,
|
||||
pub(crate) redundancy: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(StructOpt, Debug)]
|
||||
|
|
|
@ -14,6 +14,7 @@ path = "lib.rs"
|
|||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
format_table.workspace = true
|
||||
garage_db.workspace = true
|
||||
garage_util.workspace = true
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//! This module deals with graph algorithms.
|
||||
//! It is used in layout.rs to build the partition to node assignment.
|
||||
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::prelude::{SeedableRng, SliceRandom};
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::VecDeque;
|
||||
|
@ -143,7 +143,11 @@ impl Graph<FlowEdge> {
|
|||
/// This function shuffles the order of the edge lists. It keeps the ids of the
|
||||
/// reversed edges consistent.
|
||||
fn shuffle_edges(&mut self) {
|
||||
let mut rng = rand::thread_rng();
|
||||
// We use deterministic randomness so that the layout calculation algorihtm
|
||||
// will output the same thing every time it is run. This way, the results
|
||||
// pre-calculated in `garage layout show` will match exactly those used
|
||||
// in practice with `garage layout apply`
|
||||
let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]);
|
||||
for i in 0..self.graph.len() {
|
||||
self.graph[i].shuffle(&mut rng);
|
||||
// We need to update the ids of the reverse edges.
|
||||
|
@ -189,7 +193,7 @@ impl Graph<FlowEdge> {
|
|||
let mut fifo = VecDeque::new();
|
||||
fifo.push_back((idsource, 0));
|
||||
while let Some((id, lvl)) = fifo.pop_front() {
|
||||
if level[id] == None {
|
||||
if level[id].is_none() {
|
||||
// it means id has not yet been reached
|
||||
level[id] = Some(lvl);
|
||||
for edge in self.graph[id].iter() {
|
||||
|
@ -199,7 +203,7 @@ impl Graph<FlowEdge> {
|
|||
}
|
||||
}
|
||||
}
|
||||
if level[idsink] == None {
|
||||
if level[idsink].is_none() {
|
||||
// There is no residual flow
|
||||
break;
|
||||
}
|
||||
|
@ -383,7 +387,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
|
|||
for t in 0..forest.len() {
|
||||
let mut id = t;
|
||||
// while we are on a valid undiscovered node
|
||||
while time_of_discovery[id] == None {
|
||||
while time_of_discovery[id].is_none() {
|
||||
time_of_discovery[id] = Some(t);
|
||||
if let Some(i) = forest[id] {
|
||||
id = i;
|
||||
|
@ -391,7 +395,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
|
|||
break;
|
||||
}
|
||||
}
|
||||
if forest[id] != None && time_of_discovery[id] == Some(t) {
|
||||
if forest[id].is_some() && time_of_discovery[id] == Some(t) {
|
||||
// We discovered an id that we explored at this iteration t.
|
||||
// It means we are on a cycle
|
||||
let mut cy = vec![id; 1];
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
|
||||
use bytesize::ByteSize;
|
||||
use itertools::Itertools;
|
||||
|
@ -115,7 +116,16 @@ mod v09 {
|
|||
/// algorithm. It is stored as a Crdt.
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub struct LayoutParameters {
|
||||
pub zone_redundancy: usize,
|
||||
pub zone_redundancy: ZoneRedundancy,
|
||||
}
|
||||
|
||||
/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
|
||||
/// of each partition on at least that number of different zones.
|
||||
/// Otherwise, copies will be stored on the maximum possible number of zones.
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub enum ZoneRedundancy {
|
||||
AtLeast(usize),
|
||||
Maximum,
|
||||
}
|
||||
|
||||
impl garage_util::migrate::Migrate for ClusterLayout {
|
||||
|
@ -125,7 +135,6 @@ mod v09 {
|
|||
|
||||
fn migrate(previous: Self::Previous) -> Self {
|
||||
use itertools::Itertools;
|
||||
use std::collections::HashSet;
|
||||
|
||||
// In the old layout, capacities are in an arbitrary unit,
|
||||
// but in the new layout they are in bytes.
|
||||
|
@ -152,17 +161,10 @@ mod v09 {
|
|||
.min()
|
||||
.unwrap_or(0);
|
||||
|
||||
// Determine zone redundancy parameter
|
||||
let zone_redundancy = std::cmp::min(
|
||||
previous.replication_factor,
|
||||
roles
|
||||
.items()
|
||||
.iter()
|
||||
.filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str()))
|
||||
.collect::<HashSet<&str>>()
|
||||
.len(),
|
||||
);
|
||||
let parameters = LayoutParameters { zone_redundancy };
|
||||
// By default, zone_redundancy is maximum possible value
|
||||
let parameters = LayoutParameters {
|
||||
zone_redundancy: ZoneRedundancy::Maximum,
|
||||
};
|
||||
|
||||
let mut res = Self {
|
||||
version: previous.version,
|
||||
|
@ -193,7 +195,7 @@ mod v09 {
|
|||
..
|
||||
})) = role
|
||||
{
|
||||
*cap = *cap * mul;
|
||||
*cap *= mul;
|
||||
}
|
||||
new_roles.merge_raw(node, *ts, &role);
|
||||
}
|
||||
|
@ -224,15 +226,39 @@ impl NodeRole {
|
|||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZoneRedundancy {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
ZoneRedundancy::Maximum => write!(f, "maximum"),
|
||||
ZoneRedundancy::AtLeast(x) => write!(f, "{}", x),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl core::str::FromStr for ZoneRedundancy {
|
||||
type Err = &'static str;
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum),
|
||||
x => {
|
||||
let v = x
|
||||
.parse::<usize>()
|
||||
.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?;
|
||||
Ok(ZoneRedundancy::AtLeast(v))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Implementation of the ClusterLayout methods unrelated to the assignment algorithm.
|
||||
impl ClusterLayout {
|
||||
pub fn new(replication_factor: usize) -> Self {
|
||||
// We set the default zone redundancy to be equal to the replication factor,
|
||||
// i.e. as strict as possible.
|
||||
// We set the default zone redundancy to be Maximum, meaning that the maximum
|
||||
// possible value will be used depending on the cluster topology
|
||||
let parameters = LayoutParameters {
|
||||
zone_redundancy: replication_factor,
|
||||
zone_redundancy: ZoneRedundancy::Maximum,
|
||||
};
|
||||
let staging_parameters = Lww::<LayoutParameters>::new(parameters.clone());
|
||||
let staging_parameters = Lww::<LayoutParameters>::new(parameters);
|
||||
|
||||
let empty_lwwmap = LwwMap::new();
|
||||
|
||||
|
@ -296,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
|
||||
self.roles.merge(&self.staging_roles);
|
||||
self.roles.retain(|(_, _, v)| v.0.is_some());
|
||||
self.parameters = self.staging_parameters.get().clone();
|
||||
self.parameters = *self.staging_parameters.get();
|
||||
|
||||
self.staging_roles.clear();
|
||||
self.staging_hash = self.calculate_staging_hash();
|
||||
|
@ -325,7 +351,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
}
|
||||
|
||||
self.staging_roles.clear();
|
||||
self.staging_parameters.update(self.parameters.clone());
|
||||
self.staging_parameters.update(self.parameters);
|
||||
self.staging_hash = self.calculate_staging_hash();
|
||||
|
||||
self.version += 1;
|
||||
|
@ -356,7 +382,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
let mut result = Vec::<Uuid>::new();
|
||||
for uuid in self.node_id_vec.iter() {
|
||||
match self.node_role(uuid) {
|
||||
Some(role) if role.capacity != None => result.push(*uuid),
|
||||
Some(role) if role.capacity.is_some() => result.push(*uuid),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
Ok(total_capacity)
|
||||
}
|
||||
|
||||
/// Returns the effective value of the zone_redundancy parameter
|
||||
fn effective_zone_redundancy(&self) -> usize {
|
||||
match self.parameters.zone_redundancy {
|
||||
ZoneRedundancy::AtLeast(v) => v,
|
||||
ZoneRedundancy::Maximum => {
|
||||
let n_zones = self
|
||||
.roles
|
||||
.items()
|
||||
.iter()
|
||||
.filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str()))
|
||||
.collect::<HashSet<&str>>()
|
||||
.len();
|
||||
std::cmp::min(n_zones, self.replication_factor)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check a cluster layout for internal consistency
|
||||
/// (assignment, roles, parameters, partition size)
|
||||
/// returns true if consistent, false if error
|
||||
|
@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
}
|
||||
|
||||
// Check that every partition is associated to distinct nodes
|
||||
let zone_redundancy = self.effective_zone_redundancy();
|
||||
let rf = self.replication_factor;
|
||||
for p in 0..(1 << PARTITION_BITS) {
|
||||
let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec();
|
||||
|
@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
.expect("Zone not found.")
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let redundancy = self.parameters.zone_redundancy;
|
||||
if zones_of_p.iter().unique().count() < redundancy {
|
||||
if zones_of_p.iter().unique().count() < zone_redundancy {
|
||||
return Err(format!(
|
||||
"nodes of partition are in less than {} distinct zones",
|
||||
redundancy
|
||||
zone_redundancy
|
||||
));
|
||||
}
|
||||
}
|
||||
|
@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
// algorithm.
|
||||
let cl2 = self.clone();
|
||||
let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap();
|
||||
match cl2.compute_optimal_partition_size(&zone_to_id) {
|
||||
match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) {
|
||||
Ok(s) if s != self.partition_size => {
|
||||
return Err(format!(
|
||||
"partition_size ({}) is different than optimal value ({})",
|
||||
|
@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
|||
}
|
||||
}
|
||||
|
||||
// ====================================================================================
|
||||
|
||||
// Implementation of the ClusterLayout methods related to the assignment algorithm.
|
||||
impl ClusterLayout {
|
||||
/// This function calculates a new partition-to-node assignment.
|
||||
|
@ -549,13 +594,15 @@ impl ClusterLayout {
|
|||
// changes in the layout. We retrieve the old_assignment reframed with new ids
|
||||
let old_assignment_opt = self.update_node_id_vec()?;
|
||||
|
||||
let zone_redundancy = self.effective_zone_redundancy();
|
||||
|
||||
let mut msg = Message::new();
|
||||
msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
|
||||
msg.push("".into());
|
||||
msg.push(format!(
|
||||
"Partitions are \
|
||||
replicated {} times on at least {} distinct zones.",
|
||||
self.replication_factor, self.parameters.zone_redundancy
|
||||
self.replication_factor, zone_redundancy
|
||||
));
|
||||
|
||||
// We generate for once numerical ids for the zones of non gateway nodes,
|
||||
|
@ -570,12 +617,12 @@ impl ClusterLayout {
|
|||
nb_nongateway_nodes, self.replication_factor
|
||||
)));
|
||||
}
|
||||
if id_to_zone.len() < self.parameters.zone_redundancy {
|
||||
if id_to_zone.len() < zone_redundancy {
|
||||
return Err(Error::Message(format!(
|
||||
"The number of zones with non-gateway \
|
||||
nodes ({}) is smaller than the redundancy parameter ({})",
|
||||
id_to_zone.len(),
|
||||
self.parameters.zone_redundancy
|
||||
zone_redundancy
|
||||
)));
|
||||
}
|
||||
|
||||
|
@ -583,18 +630,18 @@ impl ClusterLayout {
|
|||
// Capacities should be given in a unit so that partition size is at least 100.
|
||||
// In this case, integer rounding plays a marginal role in the percentages of
|
||||
// optimality.
|
||||
let partition_size = self.compute_optimal_partition_size(&zone_to_id)?;
|
||||
let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?;
|
||||
|
||||
if old_assignment_opt != None {
|
||||
msg.push("".into());
|
||||
if old_assignment_opt.is_some() {
|
||||
msg.push(format!(
|
||||
"Optimal size of a partition: {} (was {} in the previous layout).",
|
||||
"Optimal partition size: {} ({} in previous layout)",
|
||||
ByteSize::b(partition_size).to_string_as(false),
|
||||
ByteSize::b(self.partition_size).to_string_as(false)
|
||||
));
|
||||
} else {
|
||||
msg.push(format!(
|
||||
"Given the replication and redundancy constraints, the \
|
||||
optimal size of a partition is {}.",
|
||||
"Optimal partition size: {}",
|
||||
ByteSize::b(partition_size).to_string_as(false)
|
||||
));
|
||||
}
|
||||
|
@ -610,7 +657,8 @@ impl ClusterLayout {
|
|||
|
||||
// We compute a first flow/assignment that is heuristically close to the previous
|
||||
// assignment
|
||||
let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?;
|
||||
let mut gflow =
|
||||
self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?;
|
||||
if let Some(assoc) = &old_assignment_opt {
|
||||
// We minimize the distance to the previous assignment.
|
||||
self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?;
|
||||
|
@ -618,7 +666,6 @@ impl ClusterLayout {
|
|||
|
||||
// We display statistics of the computation
|
||||
msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?);
|
||||
msg.push("".to_string());
|
||||
|
||||
// We update the layout structure
|
||||
self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
|
||||
|
@ -645,7 +692,7 @@ impl ClusterLayout {
|
|||
.roles
|
||||
.items()
|
||||
.iter()
|
||||
.filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None))
|
||||
.filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some()))
|
||||
.map(|(k, _, _)| *k)
|
||||
.collect();
|
||||
|
||||
|
@ -661,7 +708,7 @@ impl ClusterLayout {
|
|||
.roles
|
||||
.items()
|
||||
.iter()
|
||||
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None))
|
||||
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none()))
|
||||
.map(|(k, _, _)| *k)
|
||||
.collect();
|
||||
|
||||
|
@ -723,7 +770,7 @@ impl ClusterLayout {
|
|||
|
||||
for uuid in self.nongateway_nodes().iter() {
|
||||
let r = self.node_role(uuid).unwrap();
|
||||
if !zone_to_id.contains_key(&r.zone) && r.capacity != None {
|
||||
if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() {
|
||||
zone_to_id.insert(r.zone.clone(), id_to_zone.len());
|
||||
id_to_zone.push(r.zone.clone());
|
||||
}
|
||||
|
@ -736,9 +783,10 @@ impl ClusterLayout {
|
|||
fn compute_optimal_partition_size(
|
||||
&self,
|
||||
zone_to_id: &HashMap<String, usize>,
|
||||
zone_redundancy: usize,
|
||||
) -> Result<u64, Error> {
|
||||
let empty_set = HashSet::<(usize, usize)>::new();
|
||||
let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?;
|
||||
let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?;
|
||||
g.compute_maximal_flow()?;
|
||||
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
|
||||
return Err(Error::Message(
|
||||
|
@ -751,7 +799,12 @@ impl ClusterLayout {
|
|||
let mut s_down = 1;
|
||||
let mut s_up = self.get_total_capacity()?;
|
||||
while s_down + 1 < s_up {
|
||||
g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?;
|
||||
g = self.generate_flow_graph(
|
||||
(s_down + s_up) / 2,
|
||||
zone_to_id,
|
||||
&empty_set,
|
||||
zone_redundancy,
|
||||
)?;
|
||||
g.compute_maximal_flow()?;
|
||||
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
|
||||
s_up = (s_down + s_up) / 2;
|
||||
|
@ -790,18 +843,18 @@ impl ClusterLayout {
|
|||
partition_size: u64,
|
||||
zone_to_id: &HashMap<String, usize>,
|
||||
exclude_assoc: &HashSet<(usize, usize)>,
|
||||
zone_redundancy: usize,
|
||||
) -> Result<Graph<FlowEdge>, Error> {
|
||||
let vertices =
|
||||
ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
|
||||
let mut g = Graph::<FlowEdge>::new(&vertices);
|
||||
let nb_zones = zone_to_id.len();
|
||||
let redundancy = self.parameters.zone_redundancy;
|
||||
for p in 0..NB_PARTITIONS {
|
||||
g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?;
|
||||
g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?;
|
||||
g.add_edge(
|
||||
Vertex::Source,
|
||||
Vertex::Pdown(p),
|
||||
(self.replication_factor - redundancy) as u64,
|
||||
(self.replication_factor - zone_redundancy) as u64,
|
||||
)?;
|
||||
for z in 0..nb_zones {
|
||||
g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?;
|
||||
|
@ -830,6 +883,7 @@ impl ClusterLayout {
|
|||
&self,
|
||||
zone_to_id: &HashMap<String, usize>,
|
||||
prev_assign_opt: &Option<Vec<Vec<usize>>>,
|
||||
zone_redundancy: usize,
|
||||
) -> Result<Graph<FlowEdge>, Error> {
|
||||
// We list the (partition,node) associations that are not used in the
|
||||
// previous assignment
|
||||
|
@ -847,7 +901,12 @@ impl ClusterLayout {
|
|||
}
|
||||
|
||||
// We compute the best flow using only the edges used in the previous assignment
|
||||
let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?;
|
||||
let mut g = self.generate_flow_graph(
|
||||
self.partition_size,
|
||||
zone_to_id,
|
||||
&exclude_edge,
|
||||
zone_redundancy,
|
||||
)?;
|
||||
g.compute_maximal_flow()?;
|
||||
|
||||
// We add the excluded edges and compute the maximal flow with the full graph.
|
||||
|
@ -931,29 +990,33 @@ impl ClusterLayout {
|
|||
let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
|
||||
let total_cap = self.get_total_capacity()?;
|
||||
let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
|
||||
msg.push("".into());
|
||||
msg.push(format!(
|
||||
"Usable capacity / Total cluster capacity: {} / {} ({:.1} %)",
|
||||
"Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
|
||||
ByteSize::b(used_cap).to_string_as(false),
|
||||
ByteSize::b(total_cap).to_string_as(false),
|
||||
percent_cap
|
||||
));
|
||||
msg.push("".into());
|
||||
msg.push(
|
||||
"If the percentage is too low, it might be that the \
|
||||
replication/redundancy constraints force the use of nodes/zones with small \
|
||||
storage capacities. \
|
||||
You might want to rebalance the storage capacities or relax the constraints. \
|
||||
See the detailed statistics below and look for saturated nodes/zones."
|
||||
.into(),
|
||||
);
|
||||
msg.push(format!(
|
||||
"Recall that because of the replication factor, the actual available \
|
||||
storage capacity is {} / {} = {}.",
|
||||
ByteSize::b(used_cap).to_string_as(false),
|
||||
"Effective capacity (replication factor {}): {}",
|
||||
self.replication_factor,
|
||||
ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false)
|
||||
));
|
||||
if percent_cap < 80. {
|
||||
msg.push("".into());
|
||||
msg.push(
|
||||
"If the percentage is too low, it might be that the \
|
||||
cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
|
||||
storage capacities."
|
||||
.into(),
|
||||
);
|
||||
msg.push(
|
||||
"You might want to move storage capacity between zones or relax the redundancy constraint."
|
||||
.into(),
|
||||
);
|
||||
msg.push(
|
||||
"See the detailed statistics below and look for saturated nodes/zones.".into(),
|
||||
);
|
||||
}
|
||||
|
||||
// We define and fill in the following tables
|
||||
let storing_nodes = self.nongateway_nodes();
|
||||
|
@ -992,25 +1055,25 @@ impl ClusterLayout {
|
|||
}
|
||||
}
|
||||
|
||||
if *prev_assign_opt == None {
|
||||
if prev_assign_opt.is_none() {
|
||||
new_partitions = stored_partitions.clone();
|
||||
new_partitions_zone = stored_partitions_zone.clone();
|
||||
//new_partitions_zone = stored_partitions_zone.clone();
|
||||
}
|
||||
|
||||
// We display the statistics
|
||||
|
||||
msg.push("".into());
|
||||
if *prev_assign_opt != None {
|
||||
if prev_assign_opt.is_some() {
|
||||
let total_new_partitions: usize = new_partitions.iter().sum();
|
||||
msg.push(format!(
|
||||
"A total of {} new copies of partitions need to be \
|
||||
transferred.",
|
||||
total_new_partitions
|
||||
));
|
||||
msg.push("".into());
|
||||
}
|
||||
msg.push("".into());
|
||||
msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into());
|
||||
|
||||
let mut table = vec![];
|
||||
for z in 0..id_to_zone.len() {
|
||||
let mut nodes_of_z = Vec::<usize>::new();
|
||||
for n in 0..storing_nodes.len() {
|
||||
|
@ -1020,15 +1083,9 @@ impl ClusterLayout {
|
|||
}
|
||||
let replicated_partitions: usize =
|
||||
nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
|
||||
msg.push("".into());
|
||||
|
||||
msg.push(format!(
|
||||
"Zone {}: {} distinct partitions stored ({} new, \
|
||||
{} partition copies) ",
|
||||
id_to_zone[z],
|
||||
stored_partitions_zone[z],
|
||||
new_partitions_zone[z],
|
||||
replicated_partitions
|
||||
table.push(format!(
|
||||
"{}\tTags\tPartitions\tCapacity\tUsable capacity",
|
||||
id_to_zone[z]
|
||||
));
|
||||
|
||||
let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
|
||||
|
@ -1037,33 +1094,35 @@ impl ClusterLayout {
|
|||
total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?;
|
||||
}
|
||||
let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
|
||||
msg.push(format!(
|
||||
" Usable capacity / Total capacity: {} / {} ({:.1}%).",
|
||||
ByteSize::b(available_cap_z).to_string_as(false),
|
||||
ByteSize::b(total_cap_z).to_string_as(false),
|
||||
percent_cap_z
|
||||
));
|
||||
|
||||
for n in nodes_of_z.iter() {
|
||||
let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
|
||||
let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?;
|
||||
let tags_n = (self
|
||||
.node_role(&self.node_id_vec[*n])
|
||||
.ok_or("Node not found."))?
|
||||
.tags_string();
|
||||
msg.push(format!(
|
||||
" Node {:?}: {} partitions ({} new) ; \
|
||||
usable/total capacity: {} / {} ({:.1}%) ; tags:{}",
|
||||
let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string();
|
||||
table.push(format!(
|
||||
" {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)",
|
||||
self.node_id_vec[*n],
|
||||
tags_n,
|
||||
stored_partitions[*n],
|
||||
new_partitions[*n],
|
||||
ByteSize::b(available_cap_n).to_string_as(false),
|
||||
ByteSize::b(total_cap_n).to_string_as(false),
|
||||
ByteSize::b(available_cap_n).to_string_as(false),
|
||||
(available_cap_n as f32) / (total_cap_n as f32) * 100.0,
|
||||
tags_n
|
||||
));
|
||||
}
|
||||
|
||||
table.push(format!(
|
||||
" TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)",
|
||||
replicated_partitions,
|
||||
stored_partitions_zone[z],
|
||||
//new_partitions_zone[z],
|
||||
ByteSize::b(total_cap_z).to_string_as(false),
|
||||
ByteSize::b(available_cap_z).to_string_as(false),
|
||||
percent_cap_z
|
||||
));
|
||||
table.push("".into());
|
||||
}
|
||||
msg.push(format_table::format_table_to_string(table));
|
||||
|
||||
Ok(msg)
|
||||
}
|
||||
|
@ -1125,7 +1184,7 @@ mod tests {
|
|||
|
||||
let mut curr_zone = 0;
|
||||
|
||||
let redundancy = cl.parameters.zone_redundancy;
|
||||
let redundancy = cl.effective_zone_redundancy();
|
||||
|
||||
for replic in 0..cl.replication_factor {
|
||||
for p in 0..NB_PARTITIONS {
|
||||
|
@ -1177,8 +1236,9 @@ mod tests {
|
|||
);
|
||||
cl.staging_roles.merge(&update);
|
||||
}
|
||||
cl.staging_parameters
|
||||
.update(LayoutParameters { zone_redundancy });
|
||||
cl.staging_parameters.update(LayoutParameters {
|
||||
zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
|
||||
});
|
||||
cl.staging_hash = cl.calculate_staging_hash();
|
||||
}
|
||||
|
||||
|
|
|
@ -668,7 +668,7 @@ impl System {
|
|||
|
||||
let prev_layout_check = layout.check().is_ok();
|
||||
if layout.merge(adv) {
|
||||
if prev_layout_check && !layout.check().is_ok() {
|
||||
if prev_layout_check && layout.check().is_err() {
|
||||
error!("New cluster layout is invalid, discarding.");
|
||||
return Err(Error::Message(
|
||||
"New cluster layout is invalid, discarding.".into(),
|
||||
|
@ -724,7 +724,7 @@ impl System {
|
|||
|
||||
async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
|
||||
while !*stop_signal.borrow() {
|
||||
let not_configured = !self.ring.borrow().layout.check().is_ok();
|
||||
let not_configured = self.ring.borrow().layout.check().is_err();
|
||||
let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
|
||||
let expected_n_nodes = self.ring.borrow().layout.num_nodes();
|
||||
let bad_peers = self
|
||||
|
|
Loading…
Reference in a new issue