Garage v0.9 #473
9 changed files with 438 additions and 180 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1370,6 +1370,7 @@ dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"bytesize",
|
"bytesize",
|
||||||
"err-derive",
|
"err-derive",
|
||||||
|
"format_table",
|
||||||
"futures",
|
"futures",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"garage_db",
|
"garage_db",
|
||||||
|
|
|
@ -7,13 +7,13 @@ args@{
|
||||||
"garage_db/default"
|
"garage_db/default"
|
||||||
"garage_util/default"
|
"garage_util/default"
|
||||||
"garage_rpc/default"
|
"garage_rpc/default"
|
||||||
|
"format_table/default"
|
||||||
"garage_table/default"
|
"garage_table/default"
|
||||||
"garage_block/default"
|
"garage_block/default"
|
||||||
"garage_model/default"
|
"garage_model/default"
|
||||||
"garage_api/default"
|
"garage_api/default"
|
||||||
"garage_web/default"
|
"garage_web/default"
|
||||||
"garage/default"
|
"garage/default"
|
||||||
"format_table/default"
|
|
||||||
"k2v-client/default"
|
"k2v-client/default"
|
||||||
],
|
],
|
||||||
rustPackages,
|
rustPackages,
|
||||||
|
@ -33,7 +33,7 @@ args@{
|
||||||
ignoreLockHash,
|
ignoreLockHash,
|
||||||
}:
|
}:
|
||||||
let
|
let
|
||||||
nixifiedLockHash = "7bef0004fa84feec502c75d50632d54202c272d56d2549fc09e2a356141685bb";
|
nixifiedLockHash = "5df33eefe787762bf831e92c723c153faf8d5910332dcdf2fd941fe03be59936";
|
||||||
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
|
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
|
||||||
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
|
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
|
||||||
lockHashIgnored = if ignoreLockHash
|
lockHashIgnored = if ignoreLockHash
|
||||||
|
@ -60,13 +60,13 @@ in
|
||||||
garage_db = rustPackages.unknown.garage_db."0.8.4";
|
garage_db = rustPackages.unknown.garage_db."0.8.4";
|
||||||
garage_util = rustPackages.unknown.garage_util."0.8.4";
|
garage_util = rustPackages.unknown.garage_util."0.8.4";
|
||||||
garage_rpc = rustPackages.unknown.garage_rpc."0.8.4";
|
garage_rpc = rustPackages.unknown.garage_rpc."0.8.4";
|
||||||
|
format_table = rustPackages.unknown.format_table."0.1.1";
|
||||||
garage_table = rustPackages.unknown.garage_table."0.8.4";
|
garage_table = rustPackages.unknown.garage_table."0.8.4";
|
||||||
garage_block = rustPackages.unknown.garage_block."0.8.4";
|
garage_block = rustPackages.unknown.garage_block."0.8.4";
|
||||||
garage_model = rustPackages.unknown.garage_model."0.8.4";
|
garage_model = rustPackages.unknown.garage_model."0.8.4";
|
||||||
garage_api = rustPackages.unknown.garage_api."0.8.4";
|
garage_api = rustPackages.unknown.garage_api."0.8.4";
|
||||||
garage_web = rustPackages.unknown.garage_web."0.8.4";
|
garage_web = rustPackages.unknown.garage_web."0.8.4";
|
||||||
garage = rustPackages.unknown.garage."0.8.4";
|
garage = rustPackages.unknown.garage."0.8.4";
|
||||||
format_table = rustPackages.unknown.format_table."0.1.1";
|
|
||||||
k2v-client = rustPackages.unknown.k2v-client."0.0.4";
|
k2v-client = rustPackages.unknown.k2v-client."0.0.4";
|
||||||
};
|
};
|
||||||
"registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec {
|
"registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec {
|
||||||
|
@ -1954,6 +1954,7 @@ in
|
||||||
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out;
|
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out;
|
||||||
bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out;
|
bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out;
|
||||||
${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
|
${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
|
||||||
|
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
|
||||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out;
|
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out;
|
||||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out;
|
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out;
|
||||||
garage_db = (rustPackages."unknown".garage_db."0.8.4" { inherit profileName; }).out;
|
garage_db = (rustPackages."unknown".garage_db."0.8.4" { inherit profileName; }).out;
|
||||||
|
|
|
@ -9,18 +9,30 @@ a certain capacity, or a gateway node that does not store data and is only
|
||||||
used as an API entry point for faster cluster access.
|
used as an API entry point for faster cluster access.
|
||||||
An introduction to building cluster layouts can be found in the [production deployment](@/documentation/cookbook/real-world.md) page.
|
An introduction to building cluster layouts can be found in the [production deployment](@/documentation/cookbook/real-world.md) page.
|
||||||
|
|
||||||
|
In Garage, all of the data that can be stored in a given cluster is divided
|
||||||
|
into slices which we call *partitions*. Each partition is stored by
|
||||||
|
one or several nodes in the cluster
|
||||||
|
(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication-mode)).
|
||||||
|
The layout determines the correspondence between these partitions,
|
||||||
|
which exist on a logical level, and actual storage nodes.
|
||||||
|
|
||||||
## How cluster layouts work in Garage
|
## How cluster layouts work in Garage
|
||||||
|
|
||||||
In Garage, a cluster layout is composed of the following components:
|
A cluster layout is composed of the following components:
|
||||||
|
|
||||||
- a table of roles assigned to nodes
|
- a table of roles assigned to nodes, defined by the user
|
||||||
|
- an optimal assignation of partitions to nodes, computed by an algorithm that is run once when calling `garage layout apply` or the ApplyClusterLayout API endpoint
|
||||||
- a version number
|
- a version number
|
||||||
|
|
||||||
Garage nodes will always use the cluster layout with the highest version number.
|
Garage nodes will always use the cluster layout with the highest version number.
|
||||||
|
|
||||||
Garage nodes also maintain and synchronize between them a set of proposed role
|
Garage nodes also maintain and synchronize between them a set of proposed role
|
||||||
changes that haven't yet been applied. These changes will be applied (or
|
changes that haven't yet been applied. These changes will be applied (or
|
||||||
canceled) in the next version of the layout
|
canceled) in the next version of the layout.
|
||||||
|
|
||||||
|
All operations on the layout can be realized using the `garage` CLI or using the
|
||||||
|
[administration API endpoint](@/documentation/reference-manual/admin-api.md).
|
||||||
|
We give here a description of the CLI commands; the admin API semantics are very similar.
|
||||||
|
|
||||||
The following commands insert modifications to the set of proposed role changes
|
The following commands insert modifications to the set of proposed role changes
|
||||||
for the next layout version (but they do not create the new layout immediately):
|
for the next layout version (but they do not create the new layout immediately):
|
||||||
|
@ -51,7 +63,7 @@ commands will fail otherwise.
|
||||||
|
|
||||||
## Warnings about Garage cluster layout management
|
## Warnings about Garage cluster layout management
|
||||||
|
|
||||||
**Warning: never make several calls to `garage layout apply` or `garage layout
|
**⚠️ Never make several calls to `garage layout apply` or `garage layout
|
||||||
revert` with the same value of the `--version` flag. Doing so can lead to the
|
revert` with the same value of the `--version` flag. Doing so can lead to the
|
||||||
creation of several different layouts with the same version number, in which
|
creation of several different layouts with the same version number, in which
|
||||||
case your Garage cluster will become inconsistent until fixed.** If a call to
|
case your Garage cluster will become inconsistent until fixed.** If a call to
|
||||||
|
@ -65,13 +77,198 @@ shell, you shouldn't have much issues as long as you run commands one after
|
||||||
the other and take care of checking the output of `garage layout show`
|
the other and take care of checking the output of `garage layout show`
|
||||||
before applying any changes.
|
before applying any changes.
|
||||||
|
|
||||||
If you are using the `garage` CLI to script layout changes, follow the following recommendations:
|
If you are using the `garage` CLI or the admin API to script layout changes,
|
||||||
|
follow these recommendations:
|
||||||
|
|
||||||
- Make all of your `garage` CLI calls to the same RPC host. Do not use the
|
- If using the CLI, make all of your `garage` CLI calls to the same RPC host.
|
||||||
`garage` CLI to connect to individual nodes to send them each a piece of the
|
If using the admin API, make all of your API calls to the same Garage node. Do
|
||||||
layout changes you are making, as the changes propagate asynchronously
|
not connect to individual nodes to send them each a piece of the layout changes
|
||||||
between nodes and might not all be taken into account at the time when the
|
you are making, as the changes propagate asynchronously between nodes and might
|
||||||
new layout is applied.
|
not all be taken into account at the time when the new layout is applied.
|
||||||
|
|
||||||
- **Only call `garage layout apply` once**, and call it **strictly after** all
|
- **Only call `garage layout apply`/ApplyClusterLayout once**, and call it
|
||||||
of the `layout assign` and `layout remove` commands have returned.
|
**strictly after** all of the `layout assign` and `layout remove`
|
||||||
|
commands/UpdateClusterLayout API calls have returned.
|
||||||
|
|
||||||
|
|
||||||
|
## Understanding unexpected layout calculations
|
||||||
|
|
||||||
|
When adding, removing or modifying nodes in a cluster layout, sometimes
|
||||||
|
unexpected assignations of partitions to nodes can occur. These assignations
|
||||||
|
are in fact normal and logical, given the objectives of the algorithm. Indeed,
|
||||||
|
**the layout algorithm prioritizes moving less data between nodes over the fact
|
||||||
|
of achieving equal distribution of load. It also tries to use all links between
|
||||||
|
pairs of nodes in equal proportions when moving data.** This section presents
|
||||||
|
two examples and illustrates how one can control Garage's behavior to obtain
|
||||||
|
the desired results.
|
||||||
|
|
||||||
|
### Example 1
|
||||||
|
|
||||||
|
In this example, a cluster is originally composed of 3 nodes in 3 different
|
||||||
|
zones (data centers). The three nodes are of equal capacity, therefore they
|
||||||
|
are all fully exploited and all store a copy of all of the data in the cluster.
|
||||||
|
|
||||||
|
Then, a fourth node of the same size is added in the datacenter `dc1`.
|
||||||
|
As illustrated by the following, **Garage will by default not store any data on the new node**:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ garage layout show
|
||||||
|
==== CURRENT CLUSTER LAYOUT ====
|
||||||
|
ID Tags Zone Capacity Usable capacity
|
||||||
|
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
|
||||||
|
Zone redundancy: maximum
|
||||||
|
|
||||||
|
Current cluster layout version: 6
|
||||||
|
|
||||||
|
==== STAGED ROLE CHANGES ====
|
||||||
|
ID Tags Zone Capacity
|
||||||
|
a11c7cf18af29737 node4 dc1 1000.0 MB
|
||||||
|
|
||||||
|
|
||||||
|
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
|
||||||
|
ID Tags Zone Capacity Usable capacity
|
||||||
|
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
a11c7cf18af29737 node4 dc1 1000.0 MB 0 B (0.0%)
|
||||||
|
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
|
||||||
|
Zone redundancy: maximum
|
||||||
|
|
||||||
|
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
|
||||||
|
|
||||||
|
Partitions are replicated 3 times on at least 3 distinct zones.
|
||||||
|
|
||||||
|
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
|
||||||
|
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
|
||||||
|
Effective capacity (replication factor 3): 1000.0 MB
|
||||||
|
|
||||||
|
A total of 0 new copies of partitions need to be transferred.
|
||||||
|
|
||||||
|
dc1 Tags Partitions Capacity Usable capacity
|
||||||
|
b10c110e4e854e5a node1 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
a11c7cf18af29737 node4 0 (0 new) 1000.0 MB 0 B (0.0%)
|
||||||
|
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
|
||||||
|
|
||||||
|
dc2 Tags Partitions Capacity Usable capacity
|
||||||
|
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
|
||||||
|
dc3 Tags Partitions Capacity Usable capacity
|
||||||
|
62b218d848e86a64 node3 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
```
|
||||||
|
|
||||||
|
While unexpected, this is logical because of the following facts:
|
||||||
|
|
||||||
|
- storing some data on the new node does not help increase the total quantity
|
||||||
|
of data that can be stored on the cluster, as the two other zones (`dc2` and
|
||||||
|
`dc3`) still need to store a full copy of everything, and their capacity is
|
||||||
|
still the same;
|
||||||
|
|
||||||
|
- there is therefore no need to move any data to the new node as this would be pointless;
|
||||||
|
|
||||||
|
- moving data to the new node has a cost which the algorithm decides to not pay if not necessary.
|
||||||
|
|
||||||
|
This distribution of data may however not be what the administrator wanted: if
|
||||||
|
they added a new node to `dc1`, it might be because the existing node is too
|
||||||
|
slow, and they wish to divide its load by half. In that case, what they need to
|
||||||
|
do to force Garage to distribute the data between the two nodes is to attribute
|
||||||
|
only half of the capacity to each node in `dc1` (in our example, 500M instead of 1G).
|
||||||
|
In that case, Garage would determine that to be able to store 1G in total, it
|
||||||
|
would need to store 500M on the old node and 500M on the added one.
|
||||||
|
|
||||||
|
|
||||||
|
### Example 2
|
||||||
|
|
||||||
|
The following example is a slightly different scenario, where `dc1` had two
|
||||||
|
nodes that were used at 50%, and `dc2` and `dc3` each have one node that is
|
||||||
|
100% used. All node capacities are the same.
|
||||||
|
|
||||||
|
Then, a node from `dc1` is moved into `dc3`. One could expect that the roles of
|
||||||
|
`dc1` and `dc3` would simply be swapped: the remaining node in `dc1` would be
|
||||||
|
used at 100%, and the two nodes now in `dc3` would be used at 50%. Instead,
|
||||||
|
this happens:
|
||||||
|
|
||||||
|
```
|
||||||
|
==== CURRENT CLUSTER LAYOUT ====
|
||||||
|
ID Tags Zone Capacity Usable capacity
|
||||||
|
b10c110e4e854e5a node1 dc1 1000.0 MB 500.0 MB (50.0%)
|
||||||
|
a11c7cf18af29737 node4 dc1 1000.0 MB 500.0 MB (50.0%)
|
||||||
|
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
|
||||||
|
Zone redundancy: maximum
|
||||||
|
|
||||||
|
Current cluster layout version: 8
|
||||||
|
|
||||||
|
==== STAGED ROLE CHANGES ====
|
||||||
|
ID Tags Zone Capacity
|
||||||
|
a11c7cf18af29737 node4 dc3 1000.0 MB
|
||||||
|
|
||||||
|
|
||||||
|
==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====
|
||||||
|
ID Tags Zone Capacity Usable capacity
|
||||||
|
b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
62b218d848e86a64 node3 dc3 1000.0 MB 753.9 MB (75.4%)
|
||||||
|
a11c7cf18af29737 node4 dc3 1000.0 MB 246.1 MB (24.6%)
|
||||||
|
|
||||||
|
Zone redundancy: maximum
|
||||||
|
|
||||||
|
==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====
|
||||||
|
|
||||||
|
Partitions are replicated 3 times on at least 3 distinct zones.
|
||||||
|
|
||||||
|
Optimal partition size: 3.9 MB (3.9 MB in previous layout)
|
||||||
|
Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %)
|
||||||
|
Effective capacity (replication factor 3): 1000.0 MB
|
||||||
|
|
||||||
|
A total of 128 new copies of partitions need to be transferred.
|
||||||
|
|
||||||
|
dc1 Tags Partitions Capacity Usable capacity
|
||||||
|
b10c110e4e854e5a node1 256 (128 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
|
||||||
|
dc2 Tags Partitions Capacity Usable capacity
|
||||||
|
a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%)
|
||||||
|
|
||||||
|
dc3 Tags Partitions Capacity Usable capacity
|
||||||
|
62b218d848e86a64 node3 193 (0 new) 1000.0 MB 753.9 MB (75.4%)
|
||||||
|
a11c7cf18af29737 node4 63 (0 new) 1000.0 MB 246.1 MB (24.6%)
|
||||||
|
TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%)
|
||||||
|
```
|
||||||
|
|
||||||
|
As we can see, the node that was moved to `dc3` (node4) is only used at 25% (approximately),
|
||||||
|
whereas the node that was already in `dc3` (node3) is used at 75%.
|
||||||
|
|
||||||
|
This can be explained by the following:
|
||||||
|
|
||||||
|
- node1 will now be the only node remaining in `dc1`, thus it has to store all
|
||||||
|
of the data in the cluster. Since it was storing only half of it before, it has
|
||||||
|
to retrieve the other half from other nodes in the cluster.
|
||||||
|
|
||||||
|
- The data which it does not have is entirely stored by the other node that was
|
||||||
|
in `dc1` and that is now in `dc3` (node4). There is also a copy of it on node2
|
||||||
|
and node3 since both these nodes have a copy of everything.
|
||||||
|
|
||||||
|
- node3 and node4 are the two nodes that will now be in a datacenter that is
|
||||||
|
under-utilized (`dc3`), this means that those are the two candidates from which
|
||||||
|
data can be removed to be moved to node1.
|
||||||
|
|
||||||
|
- Garage will move data in equal proportions from all possible sources, in this
|
||||||
|
case it means that it will transfer 25% of the entire data set from node3 to
|
||||||
|
node1 and another 25% from node4 to node1.
|
||||||
|
|
||||||
|
This explains why node3 ends with 75% utilization (100% from before minus 25%
|
||||||
|
that is moved to node1), and node4 ends with 25% (50% from before minus 25%
|
||||||
|
that is moved to node1).
|
||||||
|
|
||||||
|
This illustrates the second principle of the layout computation: **if there is
|
||||||
|
a choice in moving data out of some nodes, then all links between pairs of
|
||||||
|
nodes are used in equal proportions** (this is approximately true, there is
|
||||||
|
randomness in the algorithm to achieve this so there might be some small
|
||||||
|
fluctuations, as we see above).
|
||||||
|
|
|
@ -174,16 +174,12 @@ pub async fn cmd_show_layout(
|
||||||
let layout = fetch_layout(rpc_cli, rpc_host).await?;
|
let layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||||
|
|
||||||
println!("==== CURRENT CLUSTER LAYOUT ====");
|
println!("==== CURRENT CLUSTER LAYOUT ====");
|
||||||
if !print_cluster_layout(&layout) {
|
print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
|
||||||
println!("No nodes currently have a role in the cluster.");
|
|
||||||
println!("See `garage status` to view available nodes.");
|
|
||||||
}
|
|
||||||
println!();
|
println!();
|
||||||
println!("Current cluster layout version: {}", layout.version);
|
println!("Current cluster layout version: {}", layout.version);
|
||||||
|
|
||||||
let has_role_changes = print_staging_role_changes(&layout);
|
let has_role_changes = print_staging_role_changes(&layout);
|
||||||
let has_param_changes = print_staging_parameters_changes(&layout);
|
if has_role_changes {
|
||||||
if has_role_changes || has_param_changes {
|
|
||||||
let v = layout.version;
|
let v = layout.version;
|
||||||
let res_apply = layout.apply_staged_changes(Some(v + 1));
|
let res_apply = layout.apply_staged_changes(Some(v + 1));
|
||||||
|
|
||||||
|
@ -193,9 +189,7 @@ pub async fn cmd_show_layout(
|
||||||
Ok((layout, msg)) => {
|
Ok((layout, msg)) => {
|
||||||
println!();
|
println!();
|
||||||
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
|
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
|
||||||
if !print_cluster_layout(&layout) {
|
print_cluster_layout(&layout, "No nodes have a role in the new layout.");
|
||||||
println!("No nodes have a role in the new layout.");
|
|
||||||
}
|
|
||||||
println!();
|
println!();
|
||||||
|
|
||||||
for line in msg.iter() {
|
for line in msg.iter() {
|
||||||
|
@ -267,28 +261,35 @@ pub async fn cmd_config_layout(
|
||||||
let mut did_something = false;
|
let mut did_something = false;
|
||||||
match config_opt.redundancy {
|
match config_opt.redundancy {
|
||||||
None => (),
|
None => (),
|
||||||
Some(r) => {
|
Some(r_str) => {
|
||||||
if r > layout.replication_factor {
|
let r = r_str
|
||||||
println!(
|
.parse::<ZoneRedundancy>()
|
||||||
"The zone redundancy must be smaller or equal to the \
|
.ok_or_message("invalid zone redundancy value")?;
|
||||||
replication factor ({}).",
|
if let ZoneRedundancy::AtLeast(r_int) = r {
|
||||||
layout.replication_factor
|
if r_int > layout.replication_factor {
|
||||||
);
|
return Err(Error::Message(format!(
|
||||||
} else if r < 1 {
|
"The zone redundancy must be smaller or equal to the \
|
||||||
println!("The zone redundancy must be at least 1.");
|
replication factor ({}).",
|
||||||
} else {
|
layout.replication_factor
|
||||||
layout
|
)));
|
||||||
.staging_parameters
|
} else if r_int < 1 {
|
||||||
.update(LayoutParameters { zone_redundancy: r });
|
return Err(Error::Message(
|
||||||
println!("The new zone redundancy has been saved ({}).", r);
|
"The zone redundancy must be at least 1.".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
layout
|
||||||
|
.staging_parameters
|
||||||
|
.update(LayoutParameters { zone_redundancy: r });
|
||||||
|
println!("The zone redundancy parameter has been set to '{}'.", r);
|
||||||
did_something = true;
|
did_something = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !did_something {
|
if !did_something {
|
||||||
return Err(Error::Message(
|
return Err(Error::Message(
|
||||||
"Please specify an action for `garage layout config` to do".into(),
|
"Please specify an action for `garage layout config`".into(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -326,7 +327,7 @@ pub async fn send_layout(
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
|
pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
|
||||||
let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
|
let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
|
||||||
for (id, _, role) in layout.roles.items().iter() {
|
for (id, _, role) in layout.roles.items().iter() {
|
||||||
let role = match &role.0 {
|
let role = match &role.0 {
|
||||||
|
@ -356,61 +357,54 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
|
||||||
));
|
));
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
println!();
|
if table.len() > 1 {
|
||||||
println!("Parameters of the layout computation:");
|
|
||||||
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
|
|
||||||
println!();
|
|
||||||
if table.len() == 1 {
|
|
||||||
false
|
|
||||||
} else {
|
|
||||||
format_table(table);
|
format_table(table);
|
||||||
true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool {
|
|
||||||
let has_changes = *layout.staging_parameters.get() != layout.parameters;
|
|
||||||
if has_changes {
|
|
||||||
println!();
|
|
||||||
println!("==== NEW LAYOUT PARAMETERS ====");
|
|
||||||
println!(
|
|
||||||
"Zone redundancy: {}",
|
|
||||||
layout.staging_parameters.get().zone_redundancy
|
|
||||||
);
|
|
||||||
println!();
|
println!();
|
||||||
|
println!("Zone redundancy: {}", layout.parameters.zone_redundancy);
|
||||||
|
} else {
|
||||||
|
println!("{}", empty_msg);
|
||||||
}
|
}
|
||||||
has_changes
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
|
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
|
||||||
let has_changes = layout
|
let has_role_changes = layout
|
||||||
.staging_roles
|
.staging_roles
|
||||||
.items()
|
.items()
|
||||||
.iter()
|
.iter()
|
||||||
.any(|(k, _, v)| layout.roles.get(k) != Some(v));
|
.any(|(k, _, v)| layout.roles.get(k) != Some(v));
|
||||||
|
let has_layout_changes = *layout.staging_parameters.get() != layout.parameters;
|
||||||
|
|
||||||
if has_changes {
|
if has_role_changes || has_layout_changes {
|
||||||
println!();
|
println!();
|
||||||
println!("==== STAGED ROLE CHANGES ====");
|
println!("==== STAGED ROLE CHANGES ====");
|
||||||
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
|
if has_role_changes {
|
||||||
for (id, _, role) in layout.staging_roles.items().iter() {
|
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
|
||||||
if layout.roles.get(id) == Some(role) {
|
for (id, _, role) in layout.staging_roles.items().iter() {
|
||||||
continue;
|
if layout.roles.get(id) == Some(role) {
|
||||||
}
|
continue;
|
||||||
if let Some(role) = &role.0 {
|
}
|
||||||
let tags = role.tags.join(",");
|
if let Some(role) = &role.0 {
|
||||||
table.push(format!(
|
let tags = role.tags.join(",");
|
||||||
"{:?}\t{}\t{}\t{}",
|
table.push(format!(
|
||||||
id,
|
"{:?}\t{}\t{}\t{}",
|
||||||
tags,
|
id,
|
||||||
role.zone,
|
tags,
|
||||||
role.capacity_string()
|
role.zone,
|
||||||
));
|
role.capacity_string()
|
||||||
} else {
|
));
|
||||||
table.push(format!("{:?}\tREMOVED", id));
|
} else {
|
||||||
|
table.push(format!("{:?}\tREMOVED", id));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
format_table(table);
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
if has_layout_changes {
|
||||||
|
println!(
|
||||||
|
"Zone redundancy: {}",
|
||||||
|
layout.staging_parameters.get().zone_redundancy
|
||||||
|
);
|
||||||
}
|
}
|
||||||
format_table(table);
|
|
||||||
true
|
true
|
||||||
} else {
|
} else {
|
||||||
false
|
false
|
||||||
|
|
|
@ -143,9 +143,9 @@ pub struct RemoveRoleOpt {
|
||||||
|
|
||||||
#[derive(StructOpt, Debug)]
|
#[derive(StructOpt, Debug)]
|
||||||
pub struct ConfigLayoutOpt {
|
pub struct ConfigLayoutOpt {
|
||||||
/// Zone redundancy parameter
|
/// Zone redundancy parameter ('none'/'max' or integer)
|
||||||
#[structopt(short = "r", long = "redundancy")]
|
#[structopt(short = "r", long = "redundancy")]
|
||||||
pub(crate) redundancy: Option<usize>,
|
pub(crate) redundancy: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(StructOpt, Debug)]
|
#[derive(StructOpt, Debug)]
|
||||||
|
|
|
@ -14,6 +14,7 @@ path = "lib.rs"
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
format_table.workspace = true
|
||||||
garage_db.workspace = true
|
garage_db.workspace = true
|
||||||
garage_util.workspace = true
|
garage_util.workspace = true
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
//! This module deals with graph algorithms.
|
//! This module deals with graph algorithms.
|
||||||
//! It is used in layout.rs to build the partition to node assignment.
|
//! It is used in layout.rs to build the partition to node assignment.
|
||||||
|
|
||||||
use rand::prelude::SliceRandom;
|
use rand::prelude::{SeedableRng, SliceRandom};
|
||||||
use std::cmp::{max, min};
|
use std::cmp::{max, min};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
|
@ -143,7 +143,11 @@ impl Graph<FlowEdge> {
|
||||||
/// This function shuffles the order of the edge lists. It keeps the ids of the
|
/// This function shuffles the order of the edge lists. It keeps the ids of the
|
||||||
/// reversed edges consistent.
|
/// reversed edges consistent.
|
||||||
fn shuffle_edges(&mut self) {
|
fn shuffle_edges(&mut self) {
|
||||||
let mut rng = rand::thread_rng();
|
// We use deterministic randomness so that the layout calculation algorithm
|
||||||
|
// will output the same thing every time it is run. This way, the results
|
||||||
|
// pre-calculated in `garage layout show` will match exactly those used
|
||||||
|
// in practice with `garage layout apply`
|
||||||
|
let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]);
|
||||||
for i in 0..self.graph.len() {
|
for i in 0..self.graph.len() {
|
||||||
self.graph[i].shuffle(&mut rng);
|
self.graph[i].shuffle(&mut rng);
|
||||||
// We need to update the ids of the reverse edges.
|
// We need to update the ids of the reverse edges.
|
||||||
|
@ -189,7 +193,7 @@ impl Graph<FlowEdge> {
|
||||||
let mut fifo = VecDeque::new();
|
let mut fifo = VecDeque::new();
|
||||||
fifo.push_back((idsource, 0));
|
fifo.push_back((idsource, 0));
|
||||||
while let Some((id, lvl)) = fifo.pop_front() {
|
while let Some((id, lvl)) = fifo.pop_front() {
|
||||||
if level[id] == None {
|
if level[id].is_none() {
|
||||||
// it means id has not yet been reached
|
// it means id has not yet been reached
|
||||||
level[id] = Some(lvl);
|
level[id] = Some(lvl);
|
||||||
for edge in self.graph[id].iter() {
|
for edge in self.graph[id].iter() {
|
||||||
|
@ -199,7 +203,7 @@ impl Graph<FlowEdge> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if level[idsink] == None {
|
if level[idsink].is_none() {
|
||||||
// There is no residual flow
|
// There is no residual flow
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -383,7 +387,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
|
||||||
for t in 0..forest.len() {
|
for t in 0..forest.len() {
|
||||||
let mut id = t;
|
let mut id = t;
|
||||||
// while we are on a valid undiscovered node
|
// while we are on a valid undiscovered node
|
||||||
while time_of_discovery[id] == None {
|
while time_of_discovery[id].is_none() {
|
||||||
time_of_discovery[id] = Some(t);
|
time_of_discovery[id] = Some(t);
|
||||||
if let Some(i) = forest[id] {
|
if let Some(i) = forest[id] {
|
||||||
id = i;
|
id = i;
|
||||||
|
@ -391,7 +395,7 @@ fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if forest[id] != None && time_of_discovery[id] == Some(t) {
|
if forest[id].is_some() && time_of_discovery[id] == Some(t) {
|
||||||
// We discovered an id that we explored at this iteration t.
|
// We discovered an id that we explored at this iteration t.
|
||||||
// It means we are on a cycle
|
// It means we are on a cycle
|
||||||
let mut cy = vec![id; 1];
|
let mut cy = vec![id; 1];
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
use std::fmt;
|
||||||
|
|
||||||
use bytesize::ByteSize;
|
use bytesize::ByteSize;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
@ -115,7 +116,16 @@ mod v09 {
|
||||||
/// algorithm. It is stored as a Crdt.
|
/// algorithm. It is stored as a Crdt.
|
||||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||||
pub struct LayoutParameters {
|
pub struct LayoutParameters {
|
||||||
pub zone_redundancy: usize,
|
pub zone_redundancy: ZoneRedundancy,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
|
||||||
|
/// of each partition on at least that number of different zones.
|
||||||
|
/// Otherwise, copies will be stored on the maximum possible number of zones.
|
||||||
|
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||||
|
pub enum ZoneRedundancy {
|
||||||
|
AtLeast(usize),
|
||||||
|
Maximum,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl garage_util::migrate::Migrate for ClusterLayout {
|
impl garage_util::migrate::Migrate for ClusterLayout {
|
||||||
|
@ -125,7 +135,6 @@ mod v09 {
|
||||||
|
|
||||||
fn migrate(previous: Self::Previous) -> Self {
|
fn migrate(previous: Self::Previous) -> Self {
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use std::collections::HashSet;
|
|
||||||
|
|
||||||
// In the old layout, capacities are in an arbitrary unit,
|
// In the old layout, capacities are in an arbitrary unit,
|
||||||
// but in the new layout they are in bytes.
|
// but in the new layout they are in bytes.
|
||||||
|
@ -152,17 +161,10 @@ mod v09 {
|
||||||
.min()
|
.min()
|
||||||
.unwrap_or(0);
|
.unwrap_or(0);
|
||||||
|
|
||||||
// Determine zone redundancy parameter
|
// By default, zone_redundancy is maximum possible value
|
||||||
let zone_redundancy = std::cmp::min(
|
let parameters = LayoutParameters {
|
||||||
previous.replication_factor,
|
zone_redundancy: ZoneRedundancy::Maximum,
|
||||||
roles
|
};
|
||||||
.items()
|
|
||||||
.iter()
|
|
||||||
.filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str()))
|
|
||||||
.collect::<HashSet<&str>>()
|
|
||||||
.len(),
|
|
||||||
);
|
|
||||||
let parameters = LayoutParameters { zone_redundancy };
|
|
||||||
|
|
||||||
let mut res = Self {
|
let mut res = Self {
|
||||||
version: previous.version,
|
version: previous.version,
|
||||||
|
@ -193,7 +195,7 @@ mod v09 {
|
||||||
..
|
..
|
||||||
})) = role
|
})) = role
|
||||||
{
|
{
|
||||||
*cap = *cap * mul;
|
*cap *= mul;
|
||||||
}
|
}
|
||||||
new_roles.merge_raw(node, *ts, &role);
|
new_roles.merge_raw(node, *ts, &role);
|
||||||
}
|
}
|
||||||
|
@ -224,15 +226,39 @@ impl NodeRole {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for ZoneRedundancy {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
ZoneRedundancy::Maximum => write!(f, "maximum"),
|
||||||
|
ZoneRedundancy::AtLeast(x) => write!(f, "{}", x),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl core::str::FromStr for ZoneRedundancy {
|
||||||
|
type Err = &'static str;
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
|
"none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum),
|
||||||
|
x => {
|
||||||
|
let v = x
|
||||||
|
.parse::<usize>()
|
||||||
|
.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?;
|
||||||
|
Ok(ZoneRedundancy::AtLeast(v))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Implementation of the ClusterLayout methods unrelated to the assignment algorithm.
|
// Implementation of the ClusterLayout methods unrelated to the assignment algorithm.
|
||||||
impl ClusterLayout {
|
impl ClusterLayout {
|
||||||
pub fn new(replication_factor: usize) -> Self {
|
pub fn new(replication_factor: usize) -> Self {
|
||||||
// We set the default zone redundancy to be equal to the replication factor,
|
// We set the default zone redundancy to be Maximum, meaning that the maximum
|
||||||
// i.e. as strict as possible.
|
// possible value will be used depending on the cluster topology
|
||||||
let parameters = LayoutParameters {
|
let parameters = LayoutParameters {
|
||||||
zone_redundancy: replication_factor,
|
zone_redundancy: ZoneRedundancy::Maximum,
|
||||||
};
|
};
|
||||||
let staging_parameters = Lww::<LayoutParameters>::new(parameters.clone());
|
let staging_parameters = Lww::<LayoutParameters>::new(parameters);
|
||||||
|
|
||||||
let empty_lwwmap = LwwMap::new();
|
let empty_lwwmap = LwwMap::new();
|
||||||
|
|
||||||
|
@ -296,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
|
|
||||||
self.roles.merge(&self.staging_roles);
|
self.roles.merge(&self.staging_roles);
|
||||||
self.roles.retain(|(_, _, v)| v.0.is_some());
|
self.roles.retain(|(_, _, v)| v.0.is_some());
|
||||||
self.parameters = self.staging_parameters.get().clone();
|
self.parameters = *self.staging_parameters.get();
|
||||||
|
|
||||||
self.staging_roles.clear();
|
self.staging_roles.clear();
|
||||||
self.staging_hash = self.calculate_staging_hash();
|
self.staging_hash = self.calculate_staging_hash();
|
||||||
|
@ -325,7 +351,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
}
|
}
|
||||||
|
|
||||||
self.staging_roles.clear();
|
self.staging_roles.clear();
|
||||||
self.staging_parameters.update(self.parameters.clone());
|
self.staging_parameters.update(self.parameters);
|
||||||
self.staging_hash = self.calculate_staging_hash();
|
self.staging_hash = self.calculate_staging_hash();
|
||||||
|
|
||||||
self.version += 1;
|
self.version += 1;
|
||||||
|
@ -356,7 +382,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
let mut result = Vec::<Uuid>::new();
|
let mut result = Vec::<Uuid>::new();
|
||||||
for uuid in self.node_id_vec.iter() {
|
for uuid in self.node_id_vec.iter() {
|
||||||
match self.node_role(uuid) {
|
match self.node_role(uuid) {
|
||||||
Some(role) if role.capacity != None => result.push(*uuid),
|
Some(role) if role.capacity.is_some() => result.push(*uuid),
|
||||||
_ => (),
|
_ => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
Ok(total_capacity)
|
Ok(total_capacity)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the effective value of the zone_redundancy parameter
|
||||||
|
fn effective_zone_redundancy(&self) -> usize {
|
||||||
|
match self.parameters.zone_redundancy {
|
||||||
|
ZoneRedundancy::AtLeast(v) => v,
|
||||||
|
ZoneRedundancy::Maximum => {
|
||||||
|
let n_zones = self
|
||||||
|
.roles
|
||||||
|
.items()
|
||||||
|
.iter()
|
||||||
|
.filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str()))
|
||||||
|
.collect::<HashSet<&str>>()
|
||||||
|
.len();
|
||||||
|
std::cmp::min(n_zones, self.replication_factor)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Check a cluster layout for internal consistency
|
/// Check a cluster layout for internal consistency
|
||||||
/// (assignment, roles, parameters, partition size)
|
/// (assignment, roles, parameters, partition size)
|
||||||
/// returns true if consistent, false if error
|
/// returns true if consistent, false if error
|
||||||
|
@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that every partition is associated to distinct nodes
|
// Check that every partition is associated to distinct nodes
|
||||||
|
let zone_redundancy = self.effective_zone_redundancy();
|
||||||
let rf = self.replication_factor;
|
let rf = self.replication_factor;
|
||||||
for p in 0..(1 << PARTITION_BITS) {
|
for p in 0..(1 << PARTITION_BITS) {
|
||||||
let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec();
|
let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec();
|
||||||
|
@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
.expect("Zone not found.")
|
.expect("Zone not found.")
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
let redundancy = self.parameters.zone_redundancy;
|
if zones_of_p.iter().unique().count() < zone_redundancy {
|
||||||
if zones_of_p.iter().unique().count() < redundancy {
|
|
||||||
return Err(format!(
|
return Err(format!(
|
||||||
"nodes of partition are in less than {} distinct zones",
|
"nodes of partition are in less than {} distinct zones",
|
||||||
redundancy
|
zone_redundancy
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
// algorithm.
|
// algorithm.
|
||||||
let cl2 = self.clone();
|
let cl2 = self.clone();
|
||||||
let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap();
|
let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap();
|
||||||
match cl2.compute_optimal_partition_size(&zone_to_id) {
|
match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) {
|
||||||
Ok(s) if s != self.partition_size => {
|
Ok(s) if s != self.partition_size => {
|
||||||
return Err(format!(
|
return Err(format!(
|
||||||
"partition_size ({}) is different than optimal value ({})",
|
"partition_size ({}) is different than optimal value ({})",
|
||||||
|
@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show`
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ====================================================================================
|
||||||
|
|
||||||
// Implementation of the ClusterLayout methods related to the assignment algorithm.
|
// Implementation of the ClusterLayout methods related to the assignment algorithm.
|
||||||
impl ClusterLayout {
|
impl ClusterLayout {
|
||||||
/// This function calculates a new partition-to-node assignment.
|
/// This function calculates a new partition-to-node assignment.
|
||||||
|
@ -549,13 +594,15 @@ impl ClusterLayout {
|
||||||
// changes in the layout. We retrieve the old_assignment reframed with new ids
|
// changes in the layout. We retrieve the old_assignment reframed with new ids
|
||||||
let old_assignment_opt = self.update_node_id_vec()?;
|
let old_assignment_opt = self.update_node_id_vec()?;
|
||||||
|
|
||||||
|
let zone_redundancy = self.effective_zone_redundancy();
|
||||||
|
|
||||||
let mut msg = Message::new();
|
let mut msg = Message::new();
|
||||||
msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
|
msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into());
|
||||||
msg.push("".into());
|
msg.push("".into());
|
||||||
msg.push(format!(
|
msg.push(format!(
|
||||||
"Partitions are \
|
"Partitions are \
|
||||||
replicated {} times on at least {} distinct zones.",
|
replicated {} times on at least {} distinct zones.",
|
||||||
self.replication_factor, self.parameters.zone_redundancy
|
self.replication_factor, zone_redundancy
|
||||||
));
|
));
|
||||||
|
|
||||||
// We generate for once numerical ids for the zones of non gateway nodes,
|
// We generate for once numerical ids for the zones of non gateway nodes,
|
||||||
|
@ -570,12 +617,12 @@ impl ClusterLayout {
|
||||||
nb_nongateway_nodes, self.replication_factor
|
nb_nongateway_nodes, self.replication_factor
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
if id_to_zone.len() < self.parameters.zone_redundancy {
|
if id_to_zone.len() < zone_redundancy {
|
||||||
return Err(Error::Message(format!(
|
return Err(Error::Message(format!(
|
||||||
"The number of zones with non-gateway \
|
"The number of zones with non-gateway \
|
||||||
nodes ({}) is smaller than the redundancy parameter ({})",
|
nodes ({}) is smaller than the redundancy parameter ({})",
|
||||||
id_to_zone.len(),
|
id_to_zone.len(),
|
||||||
self.parameters.zone_redundancy
|
zone_redundancy
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -583,18 +630,18 @@ impl ClusterLayout {
|
||||||
// Capacities should be given in a unit so that partition size is at least 100.
|
// Capacities should be given in a unit so that partition size is at least 100.
|
||||||
// In this case, integer rounding plays a marginal role in the percentages of
|
// In this case, integer rounding plays a marginal role in the percentages of
|
||||||
// optimality.
|
// optimality.
|
||||||
let partition_size = self.compute_optimal_partition_size(&zone_to_id)?;
|
let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?;
|
||||||
|
|
||||||
if old_assignment_opt != None {
|
msg.push("".into());
|
||||||
|
if old_assignment_opt.is_some() {
|
||||||
msg.push(format!(
|
msg.push(format!(
|
||||||
"Optimal size of a partition: {} (was {} in the previous layout).",
|
"Optimal partition size: {} ({} in previous layout)",
|
||||||
ByteSize::b(partition_size).to_string_as(false),
|
ByteSize::b(partition_size).to_string_as(false),
|
||||||
ByteSize::b(self.partition_size).to_string_as(false)
|
ByteSize::b(self.partition_size).to_string_as(false)
|
||||||
));
|
));
|
||||||
} else {
|
} else {
|
||||||
msg.push(format!(
|
msg.push(format!(
|
||||||
"Given the replication and redundancy constraints, the \
|
"Optimal partition size: {}",
|
||||||
optimal size of a partition is {}.",
|
|
||||||
ByteSize::b(partition_size).to_string_as(false)
|
ByteSize::b(partition_size).to_string_as(false)
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
@ -610,7 +657,8 @@ impl ClusterLayout {
|
||||||
|
|
||||||
// We compute a first flow/assignment that is heuristically close to the previous
|
// We compute a first flow/assignment that is heuristically close to the previous
|
||||||
// assignment
|
// assignment
|
||||||
let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?;
|
let mut gflow =
|
||||||
|
self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?;
|
||||||
if let Some(assoc) = &old_assignment_opt {
|
if let Some(assoc) = &old_assignment_opt {
|
||||||
// We minimize the distance to the previous assignment.
|
// We minimize the distance to the previous assignment.
|
||||||
self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?;
|
self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?;
|
||||||
|
@ -618,7 +666,6 @@ impl ClusterLayout {
|
||||||
|
|
||||||
// We display statistics of the computation
|
// We display statistics of the computation
|
||||||
msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?);
|
msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?);
|
||||||
msg.push("".to_string());
|
|
||||||
|
|
||||||
// We update the layout structure
|
// We update the layout structure
|
||||||
self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
|
self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
|
||||||
|
@ -645,7 +692,7 @@ impl ClusterLayout {
|
||||||
.roles
|
.roles
|
||||||
.items()
|
.items()
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None))
|
.filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some()))
|
||||||
.map(|(k, _, _)| *k)
|
.map(|(k, _, _)| *k)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
@ -661,7 +708,7 @@ impl ClusterLayout {
|
||||||
.roles
|
.roles
|
||||||
.items()
|
.items()
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None))
|
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none()))
|
||||||
.map(|(k, _, _)| *k)
|
.map(|(k, _, _)| *k)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
@ -723,7 +770,7 @@ impl ClusterLayout {
|
||||||
|
|
||||||
for uuid in self.nongateway_nodes().iter() {
|
for uuid in self.nongateway_nodes().iter() {
|
||||||
let r = self.node_role(uuid).unwrap();
|
let r = self.node_role(uuid).unwrap();
|
||||||
if !zone_to_id.contains_key(&r.zone) && r.capacity != None {
|
if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() {
|
||||||
zone_to_id.insert(r.zone.clone(), id_to_zone.len());
|
zone_to_id.insert(r.zone.clone(), id_to_zone.len());
|
||||||
id_to_zone.push(r.zone.clone());
|
id_to_zone.push(r.zone.clone());
|
||||||
}
|
}
|
||||||
|
@ -736,9 +783,10 @@ impl ClusterLayout {
|
||||||
fn compute_optimal_partition_size(
|
fn compute_optimal_partition_size(
|
||||||
&self,
|
&self,
|
||||||
zone_to_id: &HashMap<String, usize>,
|
zone_to_id: &HashMap<String, usize>,
|
||||||
|
zone_redundancy: usize,
|
||||||
) -> Result<u64, Error> {
|
) -> Result<u64, Error> {
|
||||||
let empty_set = HashSet::<(usize, usize)>::new();
|
let empty_set = HashSet::<(usize, usize)>::new();
|
||||||
let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?;
|
let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?;
|
||||||
g.compute_maximal_flow()?;
|
g.compute_maximal_flow()?;
|
||||||
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
|
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
|
||||||
return Err(Error::Message(
|
return Err(Error::Message(
|
||||||
|
@ -751,7 +799,12 @@ impl ClusterLayout {
|
||||||
let mut s_down = 1;
|
let mut s_down = 1;
|
||||||
let mut s_up = self.get_total_capacity()?;
|
let mut s_up = self.get_total_capacity()?;
|
||||||
while s_down + 1 < s_up {
|
while s_down + 1 < s_up {
|
||||||
g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?;
|
g = self.generate_flow_graph(
|
||||||
|
(s_down + s_up) / 2,
|
||||||
|
zone_to_id,
|
||||||
|
&empty_set,
|
||||||
|
zone_redundancy,
|
||||||
|
)?;
|
||||||
g.compute_maximal_flow()?;
|
g.compute_maximal_flow()?;
|
||||||
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
|
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
|
||||||
s_up = (s_down + s_up) / 2;
|
s_up = (s_down + s_up) / 2;
|
||||||
|
@ -790,18 +843,18 @@ impl ClusterLayout {
|
||||||
partition_size: u64,
|
partition_size: u64,
|
||||||
zone_to_id: &HashMap<String, usize>,
|
zone_to_id: &HashMap<String, usize>,
|
||||||
exclude_assoc: &HashSet<(usize, usize)>,
|
exclude_assoc: &HashSet<(usize, usize)>,
|
||||||
|
zone_redundancy: usize,
|
||||||
) -> Result<Graph<FlowEdge>, Error> {
|
) -> Result<Graph<FlowEdge>, Error> {
|
||||||
let vertices =
|
let vertices =
|
||||||
ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
|
ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
|
||||||
let mut g = Graph::<FlowEdge>::new(&vertices);
|
let mut g = Graph::<FlowEdge>::new(&vertices);
|
||||||
let nb_zones = zone_to_id.len();
|
let nb_zones = zone_to_id.len();
|
||||||
let redundancy = self.parameters.zone_redundancy;
|
|
||||||
for p in 0..NB_PARTITIONS {
|
for p in 0..NB_PARTITIONS {
|
||||||
g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?;
|
g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?;
|
||||||
g.add_edge(
|
g.add_edge(
|
||||||
Vertex::Source,
|
Vertex::Source,
|
||||||
Vertex::Pdown(p),
|
Vertex::Pdown(p),
|
||||||
(self.replication_factor - redundancy) as u64,
|
(self.replication_factor - zone_redundancy) as u64,
|
||||||
)?;
|
)?;
|
||||||
for z in 0..nb_zones {
|
for z in 0..nb_zones {
|
||||||
g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?;
|
g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?;
|
||||||
|
@ -830,6 +883,7 @@ impl ClusterLayout {
|
||||||
&self,
|
&self,
|
||||||
zone_to_id: &HashMap<String, usize>,
|
zone_to_id: &HashMap<String, usize>,
|
||||||
prev_assign_opt: &Option<Vec<Vec<usize>>>,
|
prev_assign_opt: &Option<Vec<Vec<usize>>>,
|
||||||
|
zone_redundancy: usize,
|
||||||
) -> Result<Graph<FlowEdge>, Error> {
|
) -> Result<Graph<FlowEdge>, Error> {
|
||||||
// We list the (partition,node) associations that are not used in the
|
// We list the (partition,node) associations that are not used in the
|
||||||
// previous assignment
|
// previous assignment
|
||||||
|
@ -847,7 +901,12 @@ impl ClusterLayout {
|
||||||
}
|
}
|
||||||
|
|
||||||
// We compute the best flow using only the edges used in the previous assignment
|
// We compute the best flow using only the edges used in the previous assignment
|
||||||
let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?;
|
let mut g = self.generate_flow_graph(
|
||||||
|
self.partition_size,
|
||||||
|
zone_to_id,
|
||||||
|
&exclude_edge,
|
||||||
|
zone_redundancy,
|
||||||
|
)?;
|
||||||
g.compute_maximal_flow()?;
|
g.compute_maximal_flow()?;
|
||||||
|
|
||||||
// We add the excluded edges and compute the maximal flow with the full graph.
|
// We add the excluded edges and compute the maximal flow with the full graph.
|
||||||
|
@ -931,29 +990,33 @@ impl ClusterLayout {
|
||||||
let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
|
let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
|
||||||
let total_cap = self.get_total_capacity()?;
|
let total_cap = self.get_total_capacity()?;
|
||||||
let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
|
let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
|
||||||
msg.push("".into());
|
|
||||||
msg.push(format!(
|
msg.push(format!(
|
||||||
"Usable capacity / Total cluster capacity: {} / {} ({:.1} %)",
|
"Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
|
||||||
ByteSize::b(used_cap).to_string_as(false),
|
ByteSize::b(used_cap).to_string_as(false),
|
||||||
ByteSize::b(total_cap).to_string_as(false),
|
ByteSize::b(total_cap).to_string_as(false),
|
||||||
percent_cap
|
percent_cap
|
||||||
));
|
));
|
||||||
msg.push("".into());
|
|
||||||
msg.push(
|
|
||||||
"If the percentage is too low, it might be that the \
|
|
||||||
replication/redundancy constraints force the use of nodes/zones with small \
|
|
||||||
storage capacities. \
|
|
||||||
You might want to rebalance the storage capacities or relax the constraints. \
|
|
||||||
See the detailed statistics below and look for saturated nodes/zones."
|
|
||||||
.into(),
|
|
||||||
);
|
|
||||||
msg.push(format!(
|
msg.push(format!(
|
||||||
"Recall that because of the replication factor, the actual available \
|
"Effective capacity (replication factor {}): {}",
|
||||||
storage capacity is {} / {} = {}.",
|
|
||||||
ByteSize::b(used_cap).to_string_as(false),
|
|
||||||
self.replication_factor,
|
self.replication_factor,
|
||||||
ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false)
|
ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false)
|
||||||
));
|
));
|
||||||
|
if percent_cap < 80. {
|
||||||
|
msg.push("".into());
|
||||||
|
msg.push(
|
||||||
|
"If the percentage is too low, it might be that the \
|
||||||
|
cluster topology and redundancy constraints are forcing the use of nodes/zones with small \
|
||||||
|
storage capacities."
|
||||||
|
.into(),
|
||||||
|
);
|
||||||
|
msg.push(
|
||||||
|
"You might want to move storage capacity between zones or relax the redundancy constraint."
|
||||||
|
.into(),
|
||||||
|
);
|
||||||
|
msg.push(
|
||||||
|
"See the detailed statistics below and look for saturated nodes/zones.".into(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// We define and fill in the following tables
|
// We define and fill in the following tables
|
||||||
let storing_nodes = self.nongateway_nodes();
|
let storing_nodes = self.nongateway_nodes();
|
||||||
|
@ -992,25 +1055,25 @@ impl ClusterLayout {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if *prev_assign_opt == None {
|
if prev_assign_opt.is_none() {
|
||||||
new_partitions = stored_partitions.clone();
|
new_partitions = stored_partitions.clone();
|
||||||
new_partitions_zone = stored_partitions_zone.clone();
|
//new_partitions_zone = stored_partitions_zone.clone();
|
||||||
}
|
}
|
||||||
|
|
||||||
// We display the statistics
|
// We display the statistics
|
||||||
|
|
||||||
msg.push("".into());
|
msg.push("".into());
|
||||||
if *prev_assign_opt != None {
|
if prev_assign_opt.is_some() {
|
||||||
let total_new_partitions: usize = new_partitions.iter().sum();
|
let total_new_partitions: usize = new_partitions.iter().sum();
|
||||||
msg.push(format!(
|
msg.push(format!(
|
||||||
"A total of {} new copies of partitions need to be \
|
"A total of {} new copies of partitions need to be \
|
||||||
transferred.",
|
transferred.",
|
||||||
total_new_partitions
|
total_new_partitions
|
||||||
));
|
));
|
||||||
|
msg.push("".into());
|
||||||
}
|
}
|
||||||
msg.push("".into());
|
|
||||||
msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into());
|
|
||||||
|
|
||||||
|
let mut table = vec![];
|
||||||
for z in 0..id_to_zone.len() {
|
for z in 0..id_to_zone.len() {
|
||||||
let mut nodes_of_z = Vec::<usize>::new();
|
let mut nodes_of_z = Vec::<usize>::new();
|
||||||
for n in 0..storing_nodes.len() {
|
for n in 0..storing_nodes.len() {
|
||||||
|
@ -1020,15 +1083,9 @@ impl ClusterLayout {
|
||||||
}
|
}
|
||||||
let replicated_partitions: usize =
|
let replicated_partitions: usize =
|
||||||
nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
|
nodes_of_z.iter().map(|n| stored_partitions[*n]).sum();
|
||||||
msg.push("".into());
|
table.push(format!(
|
||||||
|
"{}\tTags\tPartitions\tCapacity\tUsable capacity",
|
||||||
msg.push(format!(
|
id_to_zone[z]
|
||||||
"Zone {}: {} distinct partitions stored ({} new, \
|
|
||||||
{} partition copies) ",
|
|
||||||
id_to_zone[z],
|
|
||||||
stored_partitions_zone[z],
|
|
||||||
new_partitions_zone[z],
|
|
||||||
replicated_partitions
|
|
||||||
));
|
));
|
||||||
|
|
||||||
let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
|
let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
|
||||||
|
@ -1037,33 +1094,35 @@ impl ClusterLayout {
|
||||||
total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?;
|
total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?;
|
||||||
}
|
}
|
||||||
let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
|
let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
|
||||||
msg.push(format!(
|
|
||||||
" Usable capacity / Total capacity: {} / {} ({:.1}%).",
|
|
||||||
ByteSize::b(available_cap_z).to_string_as(false),
|
|
||||||
ByteSize::b(total_cap_z).to_string_as(false),
|
|
||||||
percent_cap_z
|
|
||||||
));
|
|
||||||
|
|
||||||
for n in nodes_of_z.iter() {
|
for n in nodes_of_z.iter() {
|
||||||
let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
|
let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
|
||||||
let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?;
|
let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?;
|
||||||
let tags_n = (self
|
let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string();
|
||||||
.node_role(&self.node_id_vec[*n])
|
table.push(format!(
|
||||||
.ok_or("Node not found."))?
|
" {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)",
|
||||||
.tags_string();
|
|
||||||
msg.push(format!(
|
|
||||||
" Node {:?}: {} partitions ({} new) ; \
|
|
||||||
usable/total capacity: {} / {} ({:.1}%) ; tags:{}",
|
|
||||||
self.node_id_vec[*n],
|
self.node_id_vec[*n],
|
||||||
|
tags_n,
|
||||||
stored_partitions[*n],
|
stored_partitions[*n],
|
||||||
new_partitions[*n],
|
new_partitions[*n],
|
||||||
ByteSize::b(available_cap_n).to_string_as(false),
|
|
||||||
ByteSize::b(total_cap_n).to_string_as(false),
|
ByteSize::b(total_cap_n).to_string_as(false),
|
||||||
|
ByteSize::b(available_cap_n).to_string_as(false),
|
||||||
(available_cap_n as f32) / (total_cap_n as f32) * 100.0,
|
(available_cap_n as f32) / (total_cap_n as f32) * 100.0,
|
||||||
tags_n
|
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
table.push(format!(
|
||||||
|
" TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)",
|
||||||
|
replicated_partitions,
|
||||||
|
stored_partitions_zone[z],
|
||||||
|
//new_partitions_zone[z],
|
||||||
|
ByteSize::b(total_cap_z).to_string_as(false),
|
||||||
|
ByteSize::b(available_cap_z).to_string_as(false),
|
||||||
|
percent_cap_z
|
||||||
|
));
|
||||||
|
table.push("".into());
|
||||||
}
|
}
|
||||||
|
msg.push(format_table::format_table_to_string(table));
|
||||||
|
|
||||||
Ok(msg)
|
Ok(msg)
|
||||||
}
|
}
|
||||||
|
@ -1125,7 +1184,7 @@ mod tests {
|
||||||
|
|
||||||
let mut curr_zone = 0;
|
let mut curr_zone = 0;
|
||||||
|
|
||||||
let redundancy = cl.parameters.zone_redundancy;
|
let redundancy = cl.effective_zone_redundancy();
|
||||||
|
|
||||||
for replic in 0..cl.replication_factor {
|
for replic in 0..cl.replication_factor {
|
||||||
for p in 0..NB_PARTITIONS {
|
for p in 0..NB_PARTITIONS {
|
||||||
|
@ -1177,8 +1236,9 @@ mod tests {
|
||||||
);
|
);
|
||||||
cl.staging_roles.merge(&update);
|
cl.staging_roles.merge(&update);
|
||||||
}
|
}
|
||||||
cl.staging_parameters
|
cl.staging_parameters.update(LayoutParameters {
|
||||||
.update(LayoutParameters { zone_redundancy });
|
zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
|
||||||
|
});
|
||||||
cl.staging_hash = cl.calculate_staging_hash();
|
cl.staging_hash = cl.calculate_staging_hash();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -668,7 +668,7 @@ impl System {
|
||||||
|
|
||||||
let prev_layout_check = layout.check().is_ok();
|
let prev_layout_check = layout.check().is_ok();
|
||||||
if layout.merge(adv) {
|
if layout.merge(adv) {
|
||||||
if prev_layout_check && !layout.check().is_ok() {
|
if prev_layout_check && layout.check().is_err() {
|
||||||
error!("New cluster layout is invalid, discarding.");
|
error!("New cluster layout is invalid, discarding.");
|
||||||
return Err(Error::Message(
|
return Err(Error::Message(
|
||||||
"New cluster layout is invalid, discarding.".into(),
|
"New cluster layout is invalid, discarding.".into(),
|
||||||
|
@ -724,7 +724,7 @@ impl System {
|
||||||
|
|
||||||
async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
|
async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
|
||||||
while !*stop_signal.borrow() {
|
while !*stop_signal.borrow() {
|
||||||
let not_configured = !self.ring.borrow().layout.check().is_ok();
|
let not_configured = self.ring.borrow().layout.check().is_err();
|
||||||
let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
|
let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
|
||||||
let expected_n_nodes = self.ring.borrow().layout.num_nodes();
|
let expected_n_nodes = self.ring.borrow().layout.num_nodes();
|
||||||
let bad_peers = self
|
let bad_peers = self
|
||||||
|
|
Loading…
Reference in a new issue