From c94406f4282d48e2e2ac82ffb57eafaad23f7edc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 9 Nov 2021 12:24:04 +0100 Subject: [PATCH] Improve how node roles are assigned in Garage - change the terminology: the network configuration becomes the role table, the configuration of a nodes becomes a node's role - the modification of the role table takes place in two steps: first, changes are staged in a CRDT data structure. Then, once the user is happy with the changes, they can commit them all at once (or revert them). - update documentation - fix tests - implement smarter partition assignation algorithm This patch breaks the format of the network configuration: when migrating, the cluster will be in a state where no roles are assigned. All roles must be re-assigned and commited at once. This migration should not pose an issue. --- Cargo.lock | 15 +- Cargo.nix | 83 +-- README.md | 12 +- doc/book/src/SUMMARY.md | 3 +- doc/book/src/cookbook/gateways.md | 10 +- doc/book/src/cookbook/real_world.md | 58 +- doc/book/src/cookbook/recovering.md | 18 +- doc/book/src/intro.md | 39 +- doc/book/src/quick_start/index.md | 27 +- .../src/reference_manual/configuration.md | 4 +- doc/book/src/reference_manual/layout.md | 74 +++ .../src/working_documents/load_balancing.md | 2 + script/dev-cluster.sh | 2 +- script/dev-configure.sh | 3 +- script/test-smoke.sh | 6 +- src/api/Cargo.toml | 9 +- src/garage/Cargo.toml | 15 +- src/garage/admin.rs | 4 +- src/garage/cli/cmd.rs | 214 ++----- src/garage/cli/layout.rs | 340 ++++++++++ src/garage/cli/mod.rs | 2 + src/garage/cli/structs.rs | 78 ++- src/garage/main.rs | 4 +- src/model/Cargo.toml | 9 +- src/rpc/Cargo.toml | 6 +- src/rpc/layout.rs | 579 ++++++++++++++++++ src/rpc/lib.rs | 1 + src/rpc/ring.rs | 197 +----- src/rpc/rpc_helper.rs | 4 +- src/rpc/system.rs | 100 +-- src/table/Cargo.toml | 7 +- src/table/lib.rs | 5 +- src/table/replication/fullcopy.rs | 4 +- src/util/Cargo.toml | 3 +- src/{table => util}/crdt/bool.rs | 0 src/{table => util}/crdt/crdt.rs | 2 +- src/{table => util}/crdt/lww.rs | 18 +- src/{table => util}/crdt/lww_map.rs | 18 +- src/{table => util}/crdt/map.rs | 0 src/{table => util}/crdt/mod.rs | 0 src/util/lib.rs | 1 + src/web/Cargo.toml | 11 +- 42 files changed, 1430 insertions(+), 557 deletions(-) create mode 100644 doc/book/src/reference_manual/layout.md create mode 100644 src/garage/cli/layout.rs create mode 100644 src/rpc/layout.rs rename src/{table => util}/crdt/bool.rs (100%) rename src/{table => util}/crdt/crdt.rs (98%) rename src/{table => util}/crdt/lww.rs (94%) rename src/{table => util}/crdt/lww_map.rs (95%) rename src/{table => util}/crdt/map.rs (100%) rename src/{table => util}/crdt/mod.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 58a28ab3..8f81c098 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -379,7 +379,7 @@ dependencies = [ [[package]] name = "garage" -version = "0.4.0" +version = "0.5.0" dependencies = [ "async-trait", "bytes 1.1.0", @@ -408,7 +408,7 @@ dependencies = [ [[package]] name = "garage_api" -version = "0.4.0" +version = "0.5.0" dependencies = [ "base64", "bytes 1.1.0", @@ -440,7 +440,7 @@ dependencies = [ [[package]] name = "garage_model" -version = "0.4.0" +version = "0.5.0" dependencies = [ "arc-swap", "async-trait", @@ -462,7 +462,7 @@ dependencies = [ [[package]] name = "garage_rpc" -version = "0.4.0" +version = "0.5.0" dependencies = [ "arc-swap", "async-trait", @@ -479,6 +479,7 @@ dependencies = [ "rand", "rmp-serde 0.15.5", "serde", + "serde_bytes", "serde_json", "tokio", "tokio-stream", @@ -486,7 +487,7 @@ dependencies = [ [[package]] name = "garage_table" -version = "0.4.0" +version = "0.5.0" dependencies = [ "async-trait", "bytes 1.1.0", @@ -506,7 +507,7 @@ dependencies = [ [[package]] name = "garage_util" -version = "0.4.0" +version = "0.5.0" dependencies = [ "blake2", "chrono", @@ -530,7 +531,7 @@ dependencies = [ [[package]] name = "garage_web" -version = "0.4.0" +version = "0.5.0" dependencies = [ "err-derive 0.3.0", "futures", diff --git a/Cargo.nix b/Cargo.nix index 5307d1e2..2e0804d5 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -40,13 +40,13 @@ in { cargo2nixVersion = "0.9.0"; workspace = { - garage_util = rustPackages.unknown.garage_util."0.4.0"; - garage_rpc = rustPackages.unknown.garage_rpc."0.4.0"; - garage_table = rustPackages.unknown.garage_table."0.4.0"; - garage_model = rustPackages.unknown.garage_model."0.4.0"; - garage_api = rustPackages.unknown.garage_api."0.4.0"; - garage_web = rustPackages.unknown.garage_web."0.4.0"; - garage = rustPackages.unknown.garage."0.4.0"; + garage_util = rustPackages.unknown.garage_util."0.5.0"; + garage_rpc = rustPackages.unknown.garage_rpc."0.5.0"; + garage_table = rustPackages.unknown.garage_table."0.5.0"; + garage_model = rustPackages.unknown.garage_model."0.5.0"; + garage_api = rustPackages.unknown.garage_api."0.5.0"; + garage_web = rustPackages.unknown.garage_web."0.5.0"; + garage = rustPackages.unknown.garage."0.5.0"; }; "registry+https://github.com/rust-lang/crates.io-index".aho-corasick."0.7.18" = overridableMkRustCrate (profileName: rec { name = "aho-corasick"; @@ -246,7 +246,7 @@ in registry = "registry+https://github.com/rust-lang/crates.io-index"; src = fetchCratesIo { inherit name version; sha256 = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469"; }; dependencies = { - ${ if hostPlatform.config == "aarch64-apple-darwin" || hostPlatform.parsed.cpu.name == "aarch64" && hostPlatform.parsed.kernel.name == "linux" then "libc" else null } = rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.103" { inherit profileName; }; + ${ if hostPlatform.parsed.cpu.name == "aarch64" && hostPlatform.parsed.kernel.name == "linux" || hostPlatform.config == "aarch64-apple-darwin" then "libc" else null } = rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.103" { inherit profileName; }; }; }); @@ -606,9 +606,9 @@ in }; }); - "unknown".garage."0.4.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage."0.5.0" = overridableMkRustCrate (profileName: rec { name = "garage"; - version = "0.4.0"; + version = "0.5.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/garage"); dependencies = { @@ -616,12 +616,12 @@ in bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.1.0" { inherit profileName; }; futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; }; futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; }; - garage_api = rustPackages."unknown".garage_api."0.4.0" { inherit profileName; }; - garage_model = rustPackages."unknown".garage_model."0.4.0" { inherit profileName; }; - garage_rpc = rustPackages."unknown".garage_rpc."0.4.0" { inherit profileName; }; - garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; }; - garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; }; - garage_web = rustPackages."unknown".garage_web."0.4.0" { inherit profileName; }; + garage_api = rustPackages."unknown".garage_api."0.5.0" { inherit profileName; }; + garage_model = rustPackages."unknown".garage_model."0.5.0" { inherit profileName; }; + garage_rpc = rustPackages."unknown".garage_rpc."0.5.0" { inherit profileName; }; + garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; }; + garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; }; + garage_web = rustPackages."unknown".garage_web."0.5.0" { inherit profileName; }; git_version = rustPackages."registry+https://github.com/rust-lang/crates.io-index".git-version."0.3.5" { inherit profileName; }; hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }; sodiumoxide = rustPackages."registry+https://github.com/rust-lang/crates.io-index".kuska-sodiumoxide."0.2.5-0" { inherit profileName; }; @@ -638,9 +638,9 @@ in }; }); - "unknown".garage_api."0.4.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_api."0.5.0" = overridableMkRustCrate (profileName: rec { name = "garage_api"; - version = "0.4.0"; + version = "0.5.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/api"); dependencies = { @@ -651,9 +651,9 @@ in err_derive = buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.0" { profileName = "__noProfile"; }; futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; }; futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; }; - garage_model = rustPackages."unknown".garage_model."0.4.0" { inherit profileName; }; - garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; }; - garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; }; + garage_model = rustPackages."unknown".garage_model."0.5.0" { inherit profileName; }; + garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; }; + garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; }; hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }; hmac = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hmac."0.10.1" { inherit profileName; }; http = rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.5" { inherit profileName; }; @@ -673,9 +673,9 @@ in }; }); - "unknown".garage_model."0.4.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_model."0.5.0" = overridableMkRustCrate (profileName: rec { name = "garage_model"; - version = "0.4.0"; + version = "0.5.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/model"); dependencies = { @@ -683,9 +683,9 @@ in async_trait = buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.51" { profileName = "__noProfile"; }; futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; }; futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; }; - garage_rpc = rustPackages."unknown".garage_rpc."0.4.0" { inherit profileName; }; - garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; }; - garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; }; + garage_rpc = rustPackages."unknown".garage_rpc."0.5.0" { inherit profileName; }; + garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; }; + garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; }; hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }; log = rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.14" { inherit profileName; }; netapp = rustPackages."registry+https://github.com/rust-lang/crates.io-index".netapp."0.3.0" { inherit profileName; }; @@ -698,9 +698,9 @@ in }; }); - "unknown".garage_rpc."0.4.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_rpc."0.5.0" = overridableMkRustCrate (profileName: rec { name = "garage_rpc"; - version = "0.4.0"; + version = "0.5.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/rpc"); dependencies = { @@ -709,7 +709,7 @@ in bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.1.0" { inherit profileName; }; futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; }; futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; }; - garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; }; + garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; }; gethostname = rustPackages."registry+https://github.com/rust-lang/crates.io-index".gethostname."0.2.1" { inherit profileName; }; hex = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }; hyper = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."0.14.13" { inherit profileName; }; @@ -719,15 +719,16 @@ in rand = rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.4" { inherit profileName; }; rmp_serde = rustPackages."registry+https://github.com/rust-lang/crates.io-index".rmp-serde."0.15.5" { inherit profileName; }; serde = rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.130" { inherit profileName; }; + serde_bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.5" { inherit profileName; }; serde_json = rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_json."1.0.68" { inherit profileName; }; tokio = rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.12.0" { inherit profileName; }; tokio_stream = rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-stream."0.1.7" { inherit profileName; }; }; }); - "unknown".garage_table."0.4.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_table."0.5.0" = overridableMkRustCrate (profileName: rec { name = "garage_table"; - version = "0.4.0"; + version = "0.5.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/table"); dependencies = { @@ -735,8 +736,8 @@ in bytes = rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.1.0" { inherit profileName; }; futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; }; futures_util = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.17" { inherit profileName; }; - garage_rpc = rustPackages."unknown".garage_rpc."0.4.0" { inherit profileName; }; - garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; }; + garage_rpc = rustPackages."unknown".garage_rpc."0.5.0" { inherit profileName; }; + garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; }; hexdump = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }; log = rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.14" { inherit profileName; }; rand = rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.4" { inherit profileName; }; @@ -748,9 +749,9 @@ in }; }); - "unknown".garage_util."0.4.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_util."0.5.0" = overridableMkRustCrate (profileName: rec { name = "garage_util"; - version = "0.4.0"; + version = "0.5.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/util"); dependencies = { @@ -775,18 +776,18 @@ in }; }); - "unknown".garage_web."0.4.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_web."0.5.0" = overridableMkRustCrate (profileName: rec { name = "garage_web"; - version = "0.4.0"; + version = "0.5.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/web"); dependencies = { err_derive = buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.0" { profileName = "__noProfile"; }; futures = rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.17" { inherit profileName; }; - garage_api = rustPackages."unknown".garage_api."0.4.0" { inherit profileName; }; - garage_model = rustPackages."unknown".garage_model."0.4.0" { inherit profileName; }; - garage_table = rustPackages."unknown".garage_table."0.4.0" { inherit profileName; }; - garage_util = rustPackages."unknown".garage_util."0.4.0" { inherit profileName; }; + garage_api = rustPackages."unknown".garage_api."0.5.0" { inherit profileName; }; + garage_model = rustPackages."unknown".garage_model."0.5.0" { inherit profileName; }; + garage_table = rustPackages."unknown".garage_table."0.5.0" { inherit profileName; }; + garage_util = rustPackages."unknown".garage_util."0.5.0" { inherit profileName; }; http = rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.5" { inherit profileName; }; hyper = rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."0.14.13" { inherit profileName; }; log = rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.14" { inherit profileName; }; diff --git a/README.md b/README.md index febb0b5b..ec7e2485 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,18 @@ Garage [![Build Status](https://drone.deuxfleurs.fr/api/badges/Deuxfleurs/garage

- Garage logo + Garage logo

+

+ [ Website and documentation + | Binary releases + | Git repository + | Matrix channel + ] +

+ Garage is a lightweight S3-compatible distributed object store, with the following goals: - As self-contained as possible @@ -22,5 +30,3 @@ Non-goals include: - Erasure coding (our replication model is simply to copy the data as is on several nodes, in different datacenters if possible) Our main use case is to provide a distributed storage layer for small-scale self hosted services such as [Deuxfleurs](https://deuxfleurs.fr). - -**[Go to the documentation](https://garagehq.deuxfleurs.fr)** diff --git a/doc/book/src/SUMMARY.md b/doc/book/src/SUMMARY.md index 1f597954..cbf6bb70 100644 --- a/doc/book/src/SUMMARY.md +++ b/doc/book/src/SUMMARY.md @@ -5,12 +5,12 @@ - [Quick start](./quick_start/index.md) - [Cookbook](./cookbook/index.md) + - [Multi-node deployment](./cookbook/real_world.md) - [Building from source](./cookbook/from_source.md) - [Integration with systemd](./cookbook/systemd.md) - [Gateways](./cookbook/gateways.md) - [Exposing buckets as websites](./cookbook/exposing_websites.md) - [Configuring a reverse proxy](./cookbook/reverse_proxy.md) - - [Production Deployment](./cookbook/real_world.md) - [Recovering from failures](./cookbook/recovering.md) - [Integrations](./connect/index.md) @@ -25,6 +25,7 @@ - [Reference Manual](./reference_manual/index.md) - [Garage configuration file](./reference_manual/configuration.md) + - [Cluster layout management](./reference_manual/layout.md) - [Garage CLI](./reference_manual/cli.md) - [S3 compatibility status](./reference_manual/s3_compatibility.md) diff --git a/doc/book/src/cookbook/gateways.md b/doc/book/src/cookbook/gateways.md index f1ad43e4..7b286b65 100644 --- a/doc/book/src/cookbook/gateways.md +++ b/doc/book/src/cookbook/gateways.md @@ -21,7 +21,9 @@ Currently it will not work with minio client. Follow issue [#64](https://git.deu The instructions are similar to a regular node, the only option that is different is while configuring the node, you must set the `--gateway` parameter: ```bash -garage node configure --gateway --tag gw1 xxxx +garage layout assign --gateway --tag gw1 +garage layout show # review the changes you are making +garage layout apply # once satisfied, apply the changes ``` Then use `http://localhost:3900` when a S3 endpoint is required: @@ -29,3 +31,9 @@ Then use `http://localhost:3900` when a S3 endpoint is required: ```bash aws --endpoint-url http://127.0.0.1:3900 s3 ls ``` + +If a newly added gateway node seems to not be working, do a full table resync to ensure that bucket and key list are correctly propagated: + +```bash +garage repair -a --yes tables +``` diff --git a/doc/book/src/cookbook/real_world.md b/doc/book/src/cookbook/real_world.md index 7864274c..4b3fec2b 100644 --- a/doc/book/src/cookbook/real_world.md +++ b/doc/book/src/cookbook/real_world.md @@ -41,15 +41,15 @@ For our example, we will suppose the following infrastructure with IPv6 connecti ## Get a Docker image -Our docker image is currently named `lxpz/garage_amd64` and is stored on the [Docker Hub](https://hub.docker.com/r/lxpz/garage_amd64/tags?page=1&ordering=last_updated). +Our docker image is currently named `dxflrs/amd64_garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/amd64_garage/tags?page=1&ordering=last_updated). We encourage you to use a fixed tag (eg. `v0.4.0`) and not the `latest` tag. For this example, we will use the latest published version at the time of the writing which is `v0.4.0` but it's up to you -to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/lxpz/garage_amd64/tags?page=1&ordering=last_updated). +to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/dxflrs/amd64_garage/tags?page=1&ordering=last_updated). For example: ``` -sudo docker pull lxpz/garage_amd64:v0.4.0 +sudo docker pull dxflrs/amd64_garage:v0.4.0 ``` ## Deploying and configuring Garage @@ -144,7 +144,7 @@ At this point, nodes are not yet talking to one another. Your output should therefore look like follows: ``` -Mercury$ garage node-id +Mercury$ garage status ==== HEALTHY NODES ==== ID Hostname Address Tag Zone Capacity 563e1ac825ee3323… Mercury [fc00:1::1]:3901 NO ROLE ASSIGNED @@ -157,14 +157,14 @@ When your Garage nodes first start, they will generate a local node identifier (based on a public/private key pair). To obtain the node identifier of a node, once it is generated, -run `garage node-id`. +run `garage node id`. This will print keys as follows: ```bash -Mercury$ garage node-id +Mercury$ garage node id 563e1ac825ee3323aa441e72c26d1030d6d4414aeb3dd25287c531e7fc2bc95d@[fc00:1::1]:3901 -Venus$ garage node-id +Venus$ garage node id 86f0f26ae4afbd59aaf9cfb059eefac844951efd5b8caeec0d53f4ed6c85f332@[fc00:1::2]:3901 etc. @@ -191,20 +191,22 @@ ID Hostname Address Tag Zone Capa 212f7572f0c89da9… Mars [fc00:F::1]:3901 NO ROLE ASSIGNED ``` -## Giving roles to nodes +## Creating a cluster layout We will now inform Garage of the disk space available on each node of the cluster as well as the zone (e.g. datacenter) in which each machine is located. +This information is called the **cluster layout** and consists +of a role that is assigned to each active cluster node. For our example, we will suppose we have the following infrastructure (Capacity, Identifier and Zone are specific values to Garage described in the following): | Location | Name | Disk Space | `Capacity` | `Identifier` | `Zone` | |----------|---------|------------|------------|--------------|--------------| -| Paris | Mercury | 1 To | `2` | `563e` | `par1` | -| Paris | Venus | 2 To | `4` | `86f0` | `par1` | -| London | Earth | 2 To | `4` | `6814` | `lon1` | -| Brussels | Mars | 1.5 To | `3` | `212f` | `bru1` | +| Paris | Mercury | 1 To | `10` | `563e` | `par1` | +| Paris | Venus | 2 To | `20` | `86f0` | `par1` | +| London | Earth | 2 To | `20` | `6814` | `lon1` | +| Brussels | Mars | 1.5 To | `15` | `212f` | `bru1` | #### Node identifiers @@ -239,13 +241,9 @@ in order to provide high availability despite failure of a zone. Garage reasons on an abstract metric about disk storage that is named the *capacity* of a node. The capacity configured in Garage must be proportional to the disk space dedicated to the node. -Due to the way the Garage allocation algorithm works, capacity values must -be **integers**, and must be **as small as possible**, for instance with -1 representing the size of your smallest server. -Here we chose that 1 unit of capacity = 0.5 To, so that we can express servers of size -1 To and 2 To, as wel as the intermediate size 1.5 To, with the integer values 2, 4 and -3 respectively (see table above). +Capacity values must be **integers** but can be given any signification. +Here we chose that 1 unit of capacity = 100 GB. Note that the amount of data stored by Garage on each server may not be strictly proportional to its capacity value, as Garage will priorize having 3 copies of data in different zones, @@ -257,13 +255,29 @@ have 66% chance of being stored by Venus and 33% chance of being stored by Mercu Given the information above, we will configure our cluster as follow: +```bash +garage layout assign -z par1 -c 10 -t mercury 563e +garage layout assign -z par1 -c 20 -t venus 86f0 +garage layout assign -z lon1 -c 20 -t earth 6814 +garage layout assign -z bru1 -c 15 -t mars 212f ``` -garage node configure -z par1 -c 2 -t mercury 563e -garage node configure -z par1 -c 4 -t venus 86f0 -garage node configure -z lon1 -c 4 -t earth 6814 -garage node configure -z bru1 -c 3 -t mars 212f + +At this point, the changes in the cluster layout have not yet been applied. +To show the new layout that will be applied, call: + +```bash +garage layout show ``` +Once you are satisfied with your new layout, apply it with: + +```bash +garage layout apply +``` + +**WARNING:** if you want to use the layout modification commands in a script, +make sure to read [this page](/reference_manual/layout.html) first. + ## Using your Garage cluster diff --git a/doc/book/src/cookbook/recovering.md b/doc/book/src/cookbook/recovering.md index a6f15fcb..279d574c 100644 --- a/doc/book/src/cookbook/recovering.md +++ b/doc/book/src/cookbook/recovering.md @@ -28,8 +28,10 @@ and you should instead use one of the methods detailed in the next sections. Removing a node is done with the following command: -``` -garage node remove --yes +```bash +garage layout remove +garage layout show # review the changes you are making +garage layout apply # once satisfied, apply the changes ``` (you can get the `node_id` of the failed node by running `garage status`) @@ -50,7 +52,7 @@ We just need to tell Garage to get back all the data blocks and store them on th First, set up a new HDD to store Garage's data directory on the failed node, and restart Garage using the existing configuration. Then, run: -``` +```bash garage repair -a --yes blocks ``` @@ -58,7 +60,7 @@ This will re-synchronize blocks of data that are missing to the new HDD, reading You can check on the advancement of this process by doing the following command: -``` +```bash garage stats -a ``` @@ -94,9 +96,11 @@ The ID of the lost node should be shown in `garage status` in the section for di Then, replace the broken node by the new one, using: -``` -garage node configure --replace \ - -c -z -t +```bash +garage layout assign --replace \ + -c -z -t +garage layout show # review the changes you are making +garage layout apply # once satisfied, apply the changes ``` Garage will then start synchronizing all required data on the new node. diff --git a/doc/book/src/intro.md b/doc/book/src/intro.md index ffce8847..a54362be 100644 --- a/doc/book/src/intro.md +++ b/doc/book/src/intro.md @@ -18,10 +18,18 @@ This very website is hosted using Garage. In other words: the doc is the PoC! # The Garage Geo-Distributed Data Store -Garage is a lightweight geo-distributed data store. -It comes from the observation that despite numerous object stores -many people have broken data management policies (backup/replication on a single site or none at all). -To promote better data management policies, we focused on the following **desirable properties**: +Garage is a lightweight geo-distributed data store that implements the +[Amazon S3](https://docs.aws.amazon.com/AmazonS3/latest/API/Welcome.html) +object storage protocole. It enables applications to store large blobs such +as pictures, video, images, documents, etc., in a redundant multi-node +setting. S3 is versatile enough to also be used to publish a static +website. + +Garage comes from the observation that despite the numerous existing +implementation of object stores, many people have broken data management +policies (backup/replication on a single site or none at all). To promote +better data management policies, we focused on the following **desirable +properties**: - **Self-contained & lightweight**: works everywhere and integrates well in existing environments to target [hyperconverged infrastructures](https://en.wikipedia.org/wiki/Hyper-converged_infrastructure). - **Highly resilient**: highly resilient to network failures, network latency, disk failures, sysadmin failures. @@ -32,26 +40,19 @@ We also noted that the pursuit of some other goals are detrimental to our initia The following has been identified as **non-goals** (if these points matter to you, you should not use Garage): - **Extreme performances**: high performances constrain a lot the design and the infrastructure; we seek performances through minimalism only. - - **Feature extensiveness**: complete implementation of the S3 API or any other API to make garage a drop-in replacement is not targeted as it could lead to decisions impacting our desirable properties. + - **Feature extensiveness**: complete implementation of the S3 API or any other API to make Garage a drop-in replacement is not targeted as it could lead to decisions impacting our desirable properties. - **Storage optimizations**: erasure coding or any other coding technique both increase the difficulty of placing data and synchronizing; we limit ourselves to duplication. - **POSIX/Filesystem compatibility**: we do not aim at being POSIX compatible or to emulate any kind of filesystem. Indeed, in a distributed environment, such synchronizations are translated in network messages that impose severe constraints on the deployment. -## Supported and planned protocols - -Garage speaks (or will speak) the following protocols: - - - [S3](https://docs.aws.amazon.com/AmazonS3/latest/API/Welcome.html) - *SUPPORTED* - Enable applications to store large blobs such as pictures, video, images, documents, etc. S3 is versatile enough to also be used to publish a static website. - - [IMAP](https://github.com/go-pluto/pluto) - *PLANNED* - email storage is quite complex to get good performances. -To keep performances optimal, most IMAP servers only support on-disk storage. -We plan to add logic to Garage to make it a viable solution for email storage. - - *More to come* - ## Use Cases -**[Deuxfleurs](https://deuxfleurs.fr):** Garage is used by Deuxfleurs which is a non-profit hosting organization. -Especially, it is used to host their main website, this documentation and some of its members' blogs. -Additionally, Garage is used as a [backend for Nextcloud](https://docs.nextcloud.com/server/20/admin_manual/configuration_files/primary_storage.html). -Deuxfleurs also plans to use Garage as their [Matrix's media backend](https://github.com/matrix-org/synapse-s3-storage-provider) and as the backend of [OCIS](https://github.com/owncloud/ocis). +**[Deuxfleurs](https://deuxfleurs.fr):** Garage is used by Deuxfleurs which +is a non-profit hosting organization. Especially, it is used to host their +main website, this documentation and some of its members' blogs. +Deuxfleurs also uses Garage as their [Matrix's media +backend](https://github.com/matrix-org/synapse-s3-storage-provider). +Deuxfleurs also uses it in its continuous integration platform to store +Drone's job logs and a Nix binary cache. *Are you using Garage? [Open a pull request](https://git.deuxfleurs.fr/Deuxfleurs/garage/) to add your organization here!* diff --git a/doc/book/src/quick_start/index.md b/doc/book/src/quick_start/index.md index 8de3fd8b..ffb3ebbe 100644 --- a/doc/book/src/quick_start/index.md +++ b/doc/book/src/quick_start/index.md @@ -6,22 +6,23 @@ and how to interact with it. Our goal is to introduce you to Garage's workflows. Following this guide is recommended before moving on to -[configuring a real-world deployment](../cookbook/real_world.md). +[configuring a multi-node cluster](../cookbook/real_world.md). -Note that this kind of deployment should not be used in production, as it provides -no redundancy for your data! +Note that this kind of deployment should not be used in production, +as it provides no redundancy for your data! ## Get a binary Download the latest Garage binary from the release pages on our repository: - + Place this binary somewhere in your `$PATH` so that you can invoke the `garage` command directly (for instance you can copy the binary in `/usr/local/bin` or in `~/.local/bin`). If a binary of the last version is not available for your architecture, +or if you want a build customized for your system, you can [build Garage from source](../cookbook/from_source.md). @@ -109,9 +110,9 @@ ID Hostname Address Tag Zone Capacit 563e1ac825ee3323… linuxbox 127.0.0.1:3901 NO ROLE ASSIGNED ``` -## Configuring your Garage node +## Creating a cluster layout -Configuring the nodes in a Garage deployment means informing Garage +Creating a cluster layout for a Garage deployment means informing Garage of the disk space available on each node of the cluster as well as the zone (e.g. datacenter) each machine is located in. @@ -119,14 +120,18 @@ For our test deployment, we are using only one node. The way in which we configu it does not matter, you can simply write: ```bash -garage node configure -z dc1 -c 1 +garage layout assign -z dc1 -c 1 ``` where `` corresponds to the identifier of the node shown by `garage status` (first column). You can enter simply a prefix of that identifier. -For instance here you could write just `garage node configure -z dc1 -c 1 563e`. +For instance here you could write just `garage layout assign -z dc1 -c 1 563e`. +The layout then has to be applied to the cluster, using: +```bash +garage layout apply +``` ## Creating buckets and keys @@ -197,7 +202,7 @@ Now that we have a bucket and a key, we need to give permissions to the key on t ``` garage bucket allow \ --read \ - --write + --write \ nextcloud-bucket \ --key nextcloud-app-key ``` @@ -270,5 +275,5 @@ The following tools can also be used to send and recieve files from/to Garage: - [Cyberduck](https://cyberduck.io/) - [`s3cmd`](https://s3tools.org/s3cmd) -Refer to the ["configuring clients"](../cookbook/clients.md) page to learn how to configure -these clients to interact with a Garage server. +Refer to the ["Integrations" section](../connect/index.md) to learn how to +configure application and command line utilities to integrate with Garage. diff --git a/doc/book/src/reference_manual/configuration.md b/doc/book/src/reference_manual/configuration.md index 61f7bcee..0b1e7bc7 100644 --- a/doc/book/src/reference_manual/configuration.md +++ b/doc/book/src/reference_manual/configuration.md @@ -133,9 +133,9 @@ These peer identifiers have the following syntax: In the case where `rpc_public_addr` is correctly specified in the configuration file, the full identifier of a node including IP and port can -be obtained by running `garage node-id` and then included directly in the +be obtained by running `garage node id` and then included directly in the `bootstrap_peers` list of other nodes. Otherwise, only the node's public -key will be returned by `garage node-id` and you will have to add the IP +key will be returned by `garage node id` and you will have to add the IP yourself. #### `consul_host` and `consul_service_name` diff --git a/doc/book/src/reference_manual/layout.md b/doc/book/src/reference_manual/layout.md new file mode 100644 index 00000000..80c71d60 --- /dev/null +++ b/doc/book/src/reference_manual/layout.md @@ -0,0 +1,74 @@ +# Creating and updating a cluster layout + +The cluster layout in Garage is a table that assigns to each node a role in +the cluster. The role of a node in Garage can either be a storage node with +a certain capacity, or a gateway node that does not store data and is only +used as an API entry point for faster cluster access. +An introduction to building cluster layouts can be found in the [production deployment](/cookbook/real_world.md) page. + +## How cluster layouts work in Garage + +In Garage, a cluster layout is composed of the following components: + +- a table of roles assigned to nodes +- a version number + +Garage nodes will always use the cluster layout with the highest version number. + +Garage nodes also maintain and synchronize between them a set of proposed role +changes that haven't yet been applied. These changes will be applied (or +canceled) in the next version of the layout + +The following commands insert modifications to the set of proposed role changes +for the next layout version (but they do not create the new layout immediately): + +```bash +garage layout assign [...] +garage layout remove [...] +``` + +The following command can be used to inspect the layout that is currently set in the cluster +and the changes proposed for the next layout version, if any: + +```bash +garage layout show +``` + +The following commands create a new layout with the specified version number, +that either takes into account the proposed changes or cancels them: + +```bash +garage layout apply --version +garage layout revert --version +``` + +The version number of the new layout to create must be 1 + the version number +of the previous layout that existed in the cluster. The `apply` and `revert` +commands will fail otherwise. + +## Warnings about Garage cluster layout management + +**Warning: never make several calls to `garage layout apply` or `garage layout +revert` with the same value of the `--version` flag. Doing so can lead to the +creation of several different layouts with the same version number, in which +case your Garage cluster will become inconsistent until fixed.** If a call to +`garage layout apply` or `garage layout revert` has failed and `garage layout +show` indicates that a new layout with the given version number has not been +set in the cluster, then it is fine to call the command again with the same +version number. + +If you are using the `garage` CLI by typing individual commands in your +shell, you shouldn't have much issues as long as you run commands one after +the other and take care of checking the output of `garage layout show` +before applying any changes. + +If you are using the `garage` CLI to script layout changes, follow the following recommendations: + +- Make all of your `garage` CLI calls to the same RPC host. Do not use the + `garage` CLI to connect to individual nodes to send them each a piece of the + layout changes you are making, as the changes propagate asynchronously + between nodes and might not all be taken into account at the time when the + new layout is applied. + +- **Only call `garage layout apply` once**, and call it **strictly after** all + of the `layout assign` and `layout remove` commands have returned. diff --git a/doc/book/src/working_documents/load_balancing.md b/doc/book/src/working_documents/load_balancing.md index c436fdcb..99271add 100644 --- a/doc/book/src/working_documents/load_balancing.md +++ b/doc/book/src/working_documents/load_balancing.md @@ -1,5 +1,7 @@ # Load Balancing Data (planned for version 0.2) +**This is being yet improved in release 0.5. The working document has not been updated yet, it still only applies to Garage 0.2 through 0.4.** + I have conducted a quick study of different methods to load-balance data over different Garage nodes using consistent hashing. ## Requirements diff --git a/script/dev-cluster.sh b/script/dev-cluster.sh index 3ca3f74b..9ac7ff34 100755 --- a/script/dev-cluster.sh +++ b/script/dev-cluster.sh @@ -69,7 +69,7 @@ done sleep 3 # Establish connections between nodes for count in $(seq 1 3); do - NODE=$(garage -c /tmp/config.$count.toml node-id -q) + NODE=$(garage -c /tmp/config.$count.toml node id -q) for count2 in $(seq 1 3); do garage -c /tmp/config.$count2.toml node connect $NODE done diff --git a/script/dev-configure.sh b/script/dev-configure.sh index add9a96a..f4bb338f 100755 --- a/script/dev-configure.sh +++ b/script/dev-configure.sh @@ -25,6 +25,7 @@ garage -c /tmp/config.1.toml status \ | grep 'NO ROLE' \ | grep -Po '^[0-9a-f]+' \ | while read id; do - garage -c /tmp/config.1.toml node configure -z dc1 -c 1 $id + garage -c /tmp/config.1.toml layout assign $id -z dc1 -c 1 done +garage -c /tmp/config.1.toml layout apply --version 1 diff --git a/script/test-smoke.sh b/script/test-smoke.sh index 86bbe3ee..87d99ca8 100755 --- a/script/test-smoke.sh +++ b/script/test-smoke.sh @@ -116,11 +116,11 @@ if [ -z "$SKIP_AWS" ]; then echo "🧪 Website Testing" echo "

hello world

" > /tmp/garage-index.html aws s3 cp /tmp/garage-index.html s3://eprouvette/index.html - [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3923/ ` == 404 ] + [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3921/ ` == 404 ] garage -c /tmp/config.1.toml bucket website --allow eprouvette - [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3923/ ` == 200 ] + [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3921/ ` == 200 ] garage -c /tmp/config.1.toml bucket website --deny eprouvette - [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3923/ ` == 404 ] + [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.garage.tld" http://127.0.0.1:3921/ ` == 404 ] aws s3 rm s3://eprouvette/index.html rm /tmp/garage-index.html fi diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index f06c67e4..3ca46764 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage_api" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" description = "S3 API server crate for the Garage object store" repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [lib] path = "lib.rs" @@ -13,9 +14,9 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_model = { version = "0.4.0", path = "../model" } -garage_table = { version = "0.4.0", path = "../table" } -garage_util = { version = "0.4.0", path = "../util" } +garage_model = { version = "0.5.0", path = "../model" } +garage_table = { version = "0.5.0", path = "../table" } +garage_util = { version = "0.5.0", path = "../util" } base64 = "0.13" bytes = "1.0" diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 0a3bc537..74a6ab0e 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" description = "Garage, an S3-compatible distributed object store for self-hosted deployments" repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [[bin]] name = "garage" @@ -14,12 +15,12 @@ path = "main.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_api = { version = "0.4.0", path = "../api" } -garage_model = { version = "0.4.0", path = "../model" } -garage_rpc = { version = "0.4.0", path = "../rpc" } -garage_table = { version = "0.4.0", path = "../table" } -garage_util = { version = "0.4.0", path = "../util" } -garage_web = { version = "0.4.0", path = "../web" } +garage_api = { version = "0.5.0", path = "../api" } +garage_model = { version = "0.5.0", path = "../model" } +garage_rpc = { version = "0.5.0", path = "../rpc" } +garage_table = { version = "0.5.0", path = "../table" } +garage_util = { version = "0.5.0", path = "../util" } +garage_web = { version = "0.5.0", path = "../web" } bytes = "1.0" git-version = "0.3.4" diff --git a/src/garage/admin.rs b/src/garage/admin.rs index c3a83d02..f0444988 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -339,7 +339,7 @@ impl AdminRpcHandler { let mut failures = vec![]; let ring = self.garage.system.ring.borrow().clone(); - for node in ring.config.members.keys() { + for node in ring.layout.node_ids().iter() { let node = (*node).into(); let resp = self .endpoint @@ -383,7 +383,7 @@ impl AdminRpcHandler { let mut ret = String::new(); let ring = self.garage.system.ring.borrow().clone(); - for node in ring.config.members.keys() { + for node in ring.layout.node_ids().iter() { let mut opt = opt.clone(); opt.all_nodes = false; diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 2ff46088..a916974e 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -2,7 +2,7 @@ use std::collections::HashSet; use garage_util::error::*; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::*; use garage_rpc::*; @@ -20,11 +20,8 @@ pub async fn cli_command_dispatch( Command::Node(NodeOperation::Connect(connect_opt)) => { cmd_connect(system_rpc_endpoint, rpc_host, connect_opt).await } - Command::Node(NodeOperation::Configure(configure_opt)) => { - cmd_configure(system_rpc_endpoint, rpc_host, configure_opt).await - } - Command::Node(NodeOperation::Remove(remove_opt)) => { - cmd_remove(system_rpc_endpoint, rpc_host, remove_opt).await + Command::Layout(layout_opt) => { + cli_layout_command_dispatch(layout_opt, system_rpc_endpoint, rpc_host).await } Command::Bucket(bo) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::BucketOperation(bo)).await @@ -48,56 +45,60 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> SystemRpc::ReturnKnownNodes(nodes) => nodes, resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), }; - let config = match rpc_cli - .call(&rpc_host, &SystemRpc::PullConfig, PRIO_NORMAL) - .await?? - { - SystemRpc::AdvertiseConfig(cfg) => cfg, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; + let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== HEALTHY NODES ===="); - let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTag\tZone\tCapacity".to_string()]; + let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - if let Some(cfg) = config.members.get(&adv.id) { - healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tag}]\t{zone}\t{capacity}", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tag = cfg.tag, - zone = cfg.zone, - capacity = cfg.capacity_string(), - )); - } else { - healthy_nodes.push(format!( - "{id:?}\t{h}\t{addr}\tNO ROLE ASSIGNED", - id = adv.id, - h = adv.status.hostname, - addr = adv.addr, - )); + match layout.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => { + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}", + id = adv.id, + host = adv.status.hostname, + addr = adv.addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = cfg.capacity_string(), + )); + } + _ => { + let new_role = match layout.staging.get(&adv.id) { + Some(NodeRoleV(Some(_))) => "(pending)", + _ => "NO ROLE ASSIGNED", + }; + healthy_nodes.push(format!( + "{id:?}\t{h}\t{addr}\t{new_role}", + id = adv.id, + h = adv.status.hostname, + addr = adv.addr, + new_role = new_role, + )); + } } } format_table(healthy_nodes); let status_keys = status.iter().map(|adv| adv.id).collect::>(); let failure_case_1 = status.iter().any(|adv| !adv.is_up); - let failure_case_2 = config - .members + let failure_case_2 = layout + .roles + .items() .iter() - .any(|(id, _)| !status_keys.contains(id)); + .filter(|(_, _, v)| v.0.is_some()) + .any(|(id, _, _)| !status_keys.contains(id)); if failure_case_1 || failure_case_2 { println!("\n==== FAILED NODES ===="); let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTag\tZone\tCapacity\tLast seen".to_string()]; + vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(cfg) = config.members.get(&adv.id) { + if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) { failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tag}]\t{zone}\t{capacity}\t{last_seen}", + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", id = adv.id, host = adv.status.hostname, addr = adv.addr, - tag = cfg.tag, + tags = cfg.tags.join(","), zone = cfg.zone, capacity = cfg.capacity_string(), last_seen = adv @@ -107,20 +108,28 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } } - for (id, cfg) in config.members.iter() { - if !status_keys.contains(id) { - failed_nodes.push(format!( - "{id:?}\t??\t??\t[{tag}]\t{zone}\t{capacity}\tnever seen", - id = id, - tag = cfg.tag, - zone = cfg.zone, - capacity = cfg.capacity_string(), - )); + for (id, _, role_v) in layout.roles.items().iter() { + if let NodeRoleV(Some(cfg)) = role_v { + if !status_keys.contains(id) { + failed_nodes.push(format!( + "{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen", + id = id, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = cfg.capacity_string(), + )); + } } } format_table(failed_nodes); } + if print_staging_role_changes(&layout) { + println!(); + println!("Please use `garage layout show` to check the proposed new layout and apply it."); + println!(); + } + Ok(()) } @@ -141,115 +150,6 @@ pub async fn cmd_connect( } } -pub async fn cmd_configure( - rpc_cli: &Endpoint, - rpc_host: NodeID, - args: ConfigureNodeOpt, -) -> Result<(), Error> { - let status = match rpc_cli - .call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL) - .await?? - { - SystemRpc::ReturnKnownNodes(nodes) => nodes, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; - - let added_node = find_matching_node(status.iter().map(|adv| adv.id), &args.node_id)?; - - let mut config = match rpc_cli - .call(&rpc_host, &SystemRpc::PullConfig, PRIO_NORMAL) - .await?? - { - SystemRpc::AdvertiseConfig(cfg) => cfg, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; - - for replaced in args.replace.iter() { - let replaced_node = find_matching_node(config.members.keys().cloned(), replaced)?; - if config.members.remove(&replaced_node).is_none() { - return Err(Error::Message(format!( - "Cannot replace node {:?} as it is not in current configuration", - replaced_node - ))); - } - } - - if args.capacity.is_some() && args.gateway { - return Err(Error::Message( - "-c and -g are mutually exclusive, please configure node either with c>0 to act as a storage node or with -g to act as a gateway node".into())); - } - if args.capacity == Some(0) { - return Err(Error::Message("Invalid capacity value: 0".into())); - } - - let new_entry = match config.members.get(&added_node) { - None => { - let capacity = match args.capacity { - Some(c) => Some(c), - None if args.gateway => None, - _ => return Err(Error::Message( - "Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())), - }; - NetworkConfigEntry { - zone: args.zone.ok_or("Please specifiy a zone with the -z flag")?, - capacity, - tag: args.tag.unwrap_or_default(), - } - } - Some(old) => { - let capacity = match args.capacity { - Some(c) => Some(c), - None if args.gateway => None, - _ => old.capacity, - }; - NetworkConfigEntry { - zone: args.zone.unwrap_or_else(|| old.zone.to_string()), - capacity, - tag: args.tag.unwrap_or_else(|| old.tag.to_string()), - } - } - }; - - config.members.insert(added_node, new_entry); - config.version += 1; - - rpc_cli - .call(&rpc_host, &SystemRpc::AdvertiseConfig(config), PRIO_NORMAL) - .await??; - Ok(()) -} - -pub async fn cmd_remove( - rpc_cli: &Endpoint, - rpc_host: NodeID, - args: RemoveNodeOpt, -) -> Result<(), Error> { - let mut config = match rpc_cli - .call(&rpc_host, &SystemRpc::PullConfig, PRIO_NORMAL) - .await?? - { - SystemRpc::AdvertiseConfig(cfg) => cfg, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; - - let deleted_node = find_matching_node(config.members.keys().cloned(), &args.node_id)?; - - if !args.yes { - return Err(Error::Message(format!( - "Add the flag --yes to really remove {:?} from the cluster", - deleted_node - ))); - } - - config.members.remove(&deleted_node); - config.version += 1; - - rpc_cli - .call(&rpc_host, &SystemRpc::AdvertiseConfig(config), PRIO_NORMAL) - .await??; - Ok(()) -} - pub async fn cmd_admin( rpc_cli: &Endpoint, rpc_host: NodeID, @@ -283,5 +183,3 @@ pub async fn cmd_admin( } Ok(()) } - -// --- Utility functions ---- diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs new file mode 100644 index 00000000..0d9e4fa4 --- /dev/null +++ b/src/garage/cli/layout.rs @@ -0,0 +1,340 @@ +use garage_util::crdt::Crdt; +use garage_util::data::*; +use garage_util::error::*; + +use garage_rpc::layout::*; +use garage_rpc::system::*; +use garage_rpc::*; + +use crate::cli::*; + +pub async fn cli_layout_command_dispatch( + cmd: LayoutOperation, + system_rpc_endpoint: &Endpoint, + rpc_host: NodeID, +) -> Result<(), Error> { + match cmd { + LayoutOperation::Assign(configure_opt) => { + cmd_assign_role(system_rpc_endpoint, rpc_host, configure_opt).await + } + LayoutOperation::Remove(remove_opt) => { + cmd_remove_role(system_rpc_endpoint, rpc_host, remove_opt).await + } + LayoutOperation::Show => cmd_show_layout(system_rpc_endpoint, rpc_host).await, + LayoutOperation::Apply(apply_opt) => { + cmd_apply_layout(system_rpc_endpoint, rpc_host, apply_opt).await + } + LayoutOperation::Revert(revert_opt) => { + cmd_revert_layout(system_rpc_endpoint, rpc_host, revert_opt).await + } + } +} + +pub async fn cmd_assign_role( + rpc_cli: &Endpoint, + rpc_host: NodeID, + args: AssignRoleOpt, +) -> Result<(), Error> { + let status = match rpc_cli + .call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL) + .await?? + { + SystemRpc::ReturnKnownNodes(nodes) => nodes, + resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + }; + + let added_node = find_matching_node(status.iter().map(|adv| adv.id), &args.node_id)?; + + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + let mut roles = layout.roles.clone(); + roles.merge(&layout.staging); + + for replaced in args.replace.iter() { + let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; + match roles.get(&replaced_node) { + Some(NodeRoleV(Some(_))) => { + layout + .staging + .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); + } + _ => { + return Err(Error::Message(format!( + "Cannot replace node {:?} as it is not currently in planned layout", + replaced_node + ))); + } + } + } + + if args.capacity.is_some() && args.gateway { + return Err(Error::Message( + "-c and -g are mutually exclusive, please configure node either with c>0 to act as a storage node or with -g to act as a gateway node".into())); + } + if args.capacity == Some(0) { + return Err(Error::Message("Invalid capacity value: 0".into())); + } + + let new_entry = match roles.get(&added_node) { + Some(NodeRoleV(Some(old))) => { + let capacity = match args.capacity { + Some(c) => Some(c), + None if args.gateway => None, + None => old.capacity, + }; + let tags = if args.tags.is_empty() { + old.tags.clone() + } else { + args.tags + }; + NodeRole { + zone: args.zone.unwrap_or_else(|| old.zone.to_string()), + capacity, + tags, + } + } + _ => { + let capacity = match args.capacity { + Some(c) => Some(c), + None if args.gateway => None, + None => return Err(Error::Message( + "Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())), + }; + NodeRole { + zone: args.zone.ok_or("Please specifiy a zone with the -z flag")?, + capacity, + tags: args.tags, + } + } + }; + + layout + .staging + .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); + + send_layout(rpc_cli, rpc_host, layout).await?; + + println!("Role change is staged but not yet commited."); + println!("Use `garage layout show` to view staged role changes,"); + println!("and `garage layout apply` to enact staged changes."); + Ok(()) +} + +pub async fn cmd_remove_role( + rpc_cli: &Endpoint, + rpc_host: NodeID, + args: RemoveRoleOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + let mut roles = layout.roles.clone(); + roles.merge(&layout.staging); + + let deleted_node = + find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; + + layout + .staging + .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); + + send_layout(rpc_cli, rpc_host, layout).await?; + + println!("Role removal is staged but not yet commited."); + println!("Use `garage layout show` to view staged role changes,"); + println!("and `garage layout apply` to enact staged changes."); + Ok(()) +} + +pub async fn cmd_show_layout( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + println!("==== CURRENT CLUSTER LAYOUT ===="); + if !print_cluster_layout(&layout) { + println!("No nodes currently have a role in the cluster."); + println!("See `garage status` to view available nodes."); + } + println!(); + println!("Current cluster layout version: {}", layout.version); + + if print_staging_role_changes(&layout) { + layout.roles.merge(&layout.staging); + + println!(); + println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); + if !print_cluster_layout(&layout) { + println!("No nodes have a role in the new layout."); + } + println!(); + + // this will print the stats of what partitions + // will move around when we apply + if layout.calculate_partition_assignation() { + println!("To enact the staged role changes, type:"); + println!(); + println!(" garage layout apply --version {}", layout.version + 1); + println!(); + println!( + "You can also revert all proposed changes with: garage layout revert --version {}", + layout.version + 1 + ); + } else { + println!("Not enough nodes have an assigned role to maintain enough copies of data."); + println!("This new layout cannot yet be applied."); + } + } + + Ok(()) +} + +pub async fn cmd_apply_layout( + rpc_cli: &Endpoint, + rpc_host: NodeID, + apply_opt: ApplyLayoutOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + match apply_opt.version { + None => { + println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout."); + println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes."); + return Err(Error::Message("--version flag is missing".into())); + } + Some(v) => { + if v != layout.version + 1 { + return Err(Error::Message("Invalid value of --version flag".into())); + } + } + } + + layout.roles.merge(&layout.staging); + + if !layout.calculate_partition_assignation() { + return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); + } + + layout.staging.clear(); + layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]); + + layout.version += 1; + + send_layout(rpc_cli, rpc_host, layout).await?; + + println!("New cluster layout with updated role assignation has been applied in cluster."); + println!("Data will now be moved around between nodes accordingly."); + + Ok(()) +} + +pub async fn cmd_revert_layout( + rpc_cli: &Endpoint, + rpc_host: NodeID, + revert_opt: RevertLayoutOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + match revert_opt.version { + None => { + println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout."); + println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes."); + return Err(Error::Message("--version flag is missing".into())); + } + Some(v) => { + if v != layout.version + 1 { + return Err(Error::Message("Invalid value of --version flag".into())); + } + } + } + + layout.staging.clear(); + layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]); + + layout.version += 1; + + send_layout(rpc_cli, rpc_host, layout).await?; + + println!("All proposed role changes in cluster layout have been canceled."); + Ok(()) +} + +// --- utility --- + +pub async fn fetch_layout( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result { + match rpc_cli + .call(&rpc_host, &SystemRpc::PullClusterLayout, PRIO_NORMAL) + .await?? + { + SystemRpc::AdvertiseClusterLayout(t) => Ok(t), + resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + } +} + +pub async fn send_layout( + rpc_cli: &Endpoint, + rpc_host: NodeID, + layout: ClusterLayout, +) -> Result<(), Error> { + rpc_cli + .call( + &rpc_host, + &SystemRpc::AdvertiseClusterLayout(layout), + PRIO_NORMAL, + ) + .await??; + Ok(()) +} + +pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { + let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; + for (id, _, role) in layout.roles.items().iter() { + let role = match &role.0 { + Some(r) => r, + _ => continue, + }; + let tags = role.tags.join(","); + table.push(format!( + "{:?}\t{}\t{}\t{}", + id, + tags, + role.zone, + role.capacity_string() + )); + } + if table.len() == 1 { + false + } else { + format_table(table); + true + } +} + +pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { + if !layout.staging.items().is_empty() { + println!(); + println!("==== STAGED ROLE CHANGES ===="); + let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; + for (id, _, role) in layout.staging.items().iter() { + if let Some(role) = &role.0 { + let tags = role.tags.join(","); + table.push(format!( + "{:?}\t{}\t{}\t{}", + id, + tags, + role.zone, + role.capacity_string() + )); + } else { + table.push(format!("{:?}\tREMOVED", id)); + } + } + format_table(table); + true + } else { + false + } +} diff --git a/src/garage/cli/mod.rs b/src/garage/cli/mod.rs index 1567f377..17a2d8ce 100644 --- a/src/garage/cli/mod.rs +++ b/src/garage/cli/mod.rs @@ -1,9 +1,11 @@ pub(crate) mod cmd; pub(crate) mod init; +pub(crate) mod layout; pub(crate) mod structs; pub(crate) mod util; pub(crate) use cmd::*; pub(crate) use init::*; +pub(crate) use layout::*; pub(crate) use structs::*; pub(crate) use util::*; diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index b930d8a8..b2b5375d 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -8,23 +8,23 @@ pub enum Command { #[structopt(name = "server")] Server, - /// Print identifier (public key) of this Garage node - #[structopt(name = "node-id")] - NodeId(NodeIdOpt), - /// Get network status #[structopt(name = "status")] Status, - /// Garage node operations + /// Operations on individual Garage nodes #[structopt(name = "node")] Node(NodeOperation), - /// Bucket operations + /// Operations on the assignation of node roles in the cluster layout + #[structopt(name = "layout")] + Layout(LayoutOperation), + + /// Operations on buckets #[structopt(name = "bucket")] Bucket(BucketOperation), - /// Key operations + /// Operations on S3 access keys #[structopt(name = "key")] Key(KeyOperation), @@ -39,17 +39,13 @@ pub enum Command { #[derive(StructOpt, Debug)] pub enum NodeOperation { + /// Print identifier (public key) of this Garage node + #[structopt(name = "id")] + NodeId(NodeIdOpt), + /// Connect to Garage node that is currently isolated from the system #[structopt(name = "connect")] Connect(ConnectNodeOpt), - - /// Configure Garage node - #[structopt(name = "configure")] - Configure(ConfigureNodeOpt), - - /// Remove Garage node from cluster - #[structopt(name = "remove")] - Remove(RemoveNodeOpt), } #[derive(StructOpt, Debug)] @@ -67,8 +63,31 @@ pub struct ConnectNodeOpt { } #[derive(StructOpt, Debug)] -pub struct ConfigureNodeOpt { - /// Node to configure (prefix of hexadecimal node id) +pub enum LayoutOperation { + /// Assign role to Garage node + #[structopt(name = "assign")] + Assign(AssignRoleOpt), + + /// Remove role from Garage cluster node + #[structopt(name = "remove")] + Remove(RemoveRoleOpt), + + /// Show roles currently assigned to nodes and changes staged for commit + #[structopt(name = "show")] + Show, + + /// Apply staged changes to cluster layout + #[structopt(name = "apply")] + Apply(ApplyLayoutOpt), + + /// Revert staged changes to cluster layout + #[structopt(name = "revert")] + Revert(RevertLayoutOpt), +} + +#[derive(StructOpt, Debug)] +pub struct AssignRoleOpt { + /// Node to which to assign role (prefix of hexadecimal node id) pub(crate) node_id: String, /// Location (zone or datacenter) of the node @@ -83,9 +102,9 @@ pub struct ConfigureNodeOpt { #[structopt(short = "g", long = "gateway")] pub(crate) gateway: bool, - /// Optional node tag + /// Optional tags to add to node #[structopt(short = "t", long = "tag")] - pub(crate) tag: Option, + pub(crate) tags: Vec, /// Replaced node(s): list of node IDs that will be removed from the current cluster #[structopt(long = "replace")] @@ -93,13 +112,24 @@ pub struct ConfigureNodeOpt { } #[derive(StructOpt, Debug)] -pub struct RemoveNodeOpt { - /// Node to configure (prefix of hexadecimal node id) +pub struct RemoveRoleOpt { + /// Node whose role to remove (prefix of hexadecimal node id) pub(crate) node_id: String, +} - /// If this flag is not given, the node won't be removed - #[structopt(long = "yes")] - pub(crate) yes: bool, +#[derive(StructOpt, Debug)] +pub struct ApplyLayoutOpt { + /// Version number of new configuration: this command will fail if + /// it is not exactly 1 + the previous configuration's version + #[structopt(long = "version")] + pub(crate) version: Option, +} + +#[derive(StructOpt, Debug)] +pub struct RevertLayoutOpt { + /// Version number of old configuration to which to revert + #[structopt(long = "version")] + pub(crate) version: Option, } #[derive(Serialize, Deserialize, StructOpt, Debug)] diff --git a/src/garage/main.rs b/src/garage/main.rs index 70c959f8..69cd16e7 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -70,7 +70,9 @@ async fn main() { server::run_server(opt.config_file).await } - Command::NodeId(node_id_opt) => node_id_command(opt.config_file, node_id_opt.quiet), + Command::Node(NodeOperation::NodeId(node_id_opt)) => { + node_id_command(opt.config_file, node_id_opt.quiet) + } _ => cli_command(opt).await, }; diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index f4085c13..7979a79a 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage_model" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" description = "Core data model for the Garage object store" repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [lib] path = "lib.rs" @@ -13,9 +14,9 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_rpc = { version = "0.4.0", path = "../rpc" } -garage_table = { version = "0.4.0", path = "../table" } -garage_util = { version = "0.4.0", path = "../util" } +garage_rpc = { version = "0.5.0", path = "../rpc" } +garage_table = { version = "0.5.0", path = "../table" } +garage_util = { version = "0.5.0", path = "../util" } async-trait = "0.1.7" arc-swap = "1.0" diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index ac7c2a2e..d8ebb71e 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage_rpc" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" description = "Cluster membership management and RPC protocol for the Garage object store" repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [lib] path = "lib.rs" @@ -13,7 +14,7 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_util = { version = "0.4.0", path = "../util" } +garage_util = { version = "0.5.0", path = "../util" } arc-swap = "1.0" bytes = "1.0" @@ -26,6 +27,7 @@ sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } async-trait = "0.1.7" rmp-serde = "0.15" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } +serde_bytes = "0.11" serde_json = "1.0" futures = "0.3" diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs new file mode 100644 index 00000000..895dbf1c --- /dev/null +++ b/src/rpc/layout.rs @@ -0,0 +1,579 @@ +use std::cmp::Ordering; +use std::collections::{HashMap, HashSet}; + +use serde::{Deserialize, Serialize}; + +use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; +use garage_util::data::*; + +use crate::ring::*; + +/// The layout of the cluster, i.e. the list of roles +/// which are assigned to each cluster node +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). + /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec, + /// the assignation of data partitions to node, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap, + pub staging_hash: Hash, +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct NodeRoleV(pub Option); + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +/// The user-assigned roles of cluster nodes +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct NodeRole { + /// Datacenter at which this entry belong. This information might be used to perform a better + /// geodistribution + pub zone: String, + /// The (relative) capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option, + /// A set of tags to recognize the node + pub tags: Vec, +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => format!("{}", c), + None => "gateway".to_string(), + } + } +} + +impl ClusterLayout { + pub fn new(replication_factor: usize) -> Self { + let empty_lwwmap = LwwMap::new(); + let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); + + ClusterLayout { + version: 0, + replication_factor, + roles: LwwMap::new(), + node_id_vec: Vec::new(), + ring_assignation_data: Vec::new(), + staging: empty_lwwmap, + staging_hash: empty_lwwmap_hash, + } + } + + pub fn merge(&mut self, other: &ClusterLayout) -> bool { + match other.version.cmp(&self.version) { + Ordering::Greater => { + *self = other.clone(); + true + } + Ordering::Equal => { + self.staging.merge(&other.staging); + + let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + let changed = new_staging_hash != self.staging_hash; + + self.staging_hash = new_staging_hash; + + changed + } + Ordering::Less => false, + } + } + + /// Returns a list of IDs of nodes that currently have + /// a role in the cluster + pub fn node_ids(&self) -> &[Uuid] { + &self.node_id_vec[..] + } + + pub fn num_nodes(&self) -> usize { + self.node_id_vec.len() + } + + /// Returns the role of a node in the layout + pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { + match self.roles.get(node) { + Some(NodeRoleV(Some(v))) => Some(v), + _ => None, + } + } + + /// Check a cluster layout for internal consistency + /// returns true if consistent, false if error + pub fn check(&self) -> bool { + // Check that the hash of the staging data is correct + let staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + if staging_hash != self.staging_hash { + return false; + } + + // Check that node_id_vec contains the correct list of nodes + let mut expected_nodes = self + .roles + .items() + .iter() + .filter(|(_, _, v)| v.0.is_some()) + .map(|(id, _, _)| *id) + .collect::>(); + expected_nodes.sort(); + let mut node_id_vec = self.node_id_vec.clone(); + node_id_vec.sort(); + if expected_nodes != node_id_vec { + return false; + } + + // Check that the assignation data has the correct length + if self.ring_assignation_data.len() != (1 << PARTITION_BITS) * self.replication_factor { + return false; + } + + // Check that the assigned nodes are correct identifiers + // of nodes that are assigned a role + // and that role is not the role of a gateway nodes + for x in self.ring_assignation_data.iter() { + if *x as usize >= self.node_id_vec.len() { + return false; + } + let node = self.node_id_vec[*x as usize]; + match self.roles.get(&node) { + Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), + _ => return false, + } + } + + true + } + + /// Calculate an assignation of partitions to nodes + pub fn calculate_partition_assignation(&mut self) -> bool { + let (configured_nodes, zones) = self.configured_nodes_and_zones(); + let n_zones = zones.len(); + + println!("Calculating updated partition assignation, this may take some time..."); + println!(); + + let old_partitions = self.parse_assignation_data(); + + let mut partitions = old_partitions.clone(); + for part in partitions.iter_mut() { + part.nodes + .retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false)); + } + + // When nodes are removed, or when bootstraping an assignation from + // scratch for a new cluster, the old partitions will have holes (or be empty). + // Here we add more nodes to make a complete (sub-optimal) assignation, + // using an initial partition assignation that is calculated using the multi-dc maglev trick + match self.initial_partition_assignation() { + Some(initial_partitions) => { + for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) { + for (id, info) in ipart.nodes.iter() { + if part.nodes.len() < self.replication_factor { + part.add(part.nodes.len() + 1, n_zones, id, info.unwrap()); + } + } + assert!(part.nodes.len() == self.replication_factor); + } + } + None => { + return false; + } + } + + // Calculate how many partitions each node should ideally store, + // and how many partitions they are storing with the current assignation + // This defines our target for which we will optimize in the following loop. + let total_capacity = configured_nodes + .iter() + .map(|(_, info)| info.capacity.unwrap_or(0)) + .sum::() as usize; + let total_partitions = self.replication_factor * (1 << PARTITION_BITS); + let target_partitions_per_node = configured_nodes + .iter() + .map(|(id, info)| { + ( + *id, + info.capacity.unwrap_or(0) as usize * total_partitions / total_capacity, + ) + }) + .collect::>(); + + let mut partitions_per_node = self.partitions_per_node(&partitions[..]); + + println!("Target number of partitions per node:"); + for (node, npart) in target_partitions_per_node.iter() { + println!("{:?}\t{}", node, npart); + } + println!(); + + // Shuffle partitions between nodes so that nodes will reach (or better approach) + // their target number of stored partitions + loop { + let mut option = None; + for (i, part) in partitions.iter_mut().enumerate() { + for (irm, (idrm, _)) in part.nodes.iter().enumerate() { + let suprm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as i32 + - target_partitions_per_node.get(*idrm).cloned().unwrap_or(0) as i32; + + for (idadd, infoadd) in configured_nodes.iter() { + // skip replacing a node by itself + // and skip replacing by gateway nodes + if idadd == idrm || infoadd.capacity.is_none() { + continue; + } + + let supadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as i32 + - target_partitions_per_node.get(*idadd).cloned().unwrap_or(0) as i32; + + // We want to try replacing node idrm by node idadd + // if that brings us close to our goal. + let square = |i: i32| i * i; + let oldcost = square(suprm) + square(supadd); + let newcost = square(suprm - 1) + square(supadd + 1); + if newcost >= oldcost { + // not closer to our goal + continue; + } + let gain = oldcost - newcost; + + let mut newpart = part.clone(); + + newpart.nodes.remove(irm); + if !newpart.add(newpart.nodes.len() + 1, n_zones, idadd, infoadd) { + continue; + } + assert!(newpart.nodes.len() == self.replication_factor); + + if !old_partitions[i] + .is_valid_transition_to(&newpart, self.replication_factor) + { + continue; + } + + if option + .as_ref() + .map(|(old_gain, _, _, _, _)| gain > *old_gain) + .unwrap_or(true) + { + option = Some((gain, i, idadd, idrm, newpart)); + } + } + } + } + if let Some((_gain, i, idadd, idrm, newpart)) = option { + *partitions_per_node.entry(idadd).or_insert(0) += 1; + *partitions_per_node.get_mut(idrm).unwrap() -= 1; + partitions[i] = newpart; + } else { + break; + } + } + + // Check we completed the assignation correctly + // (this is a set of checks for the algorithm's consistency) + assert!(partitions.len() == (1 << PARTITION_BITS)); + assert!(partitions + .iter() + .all(|p| p.nodes.len() == self.replication_factor)); + + let new_partitions_per_node = self.partitions_per_node(&partitions[..]); + assert!(new_partitions_per_node == partitions_per_node); + + // Show statistics + println!("New number of partitions per node:"); + for (node, npart) in partitions_per_node.iter() { + println!("{:?}\t{}", node, npart); + } + println!(); + + let mut diffcount = HashMap::new(); + for (oldpart, newpart) in old_partitions.iter().zip(partitions.iter()) { + let nminus = oldpart.txtplus(newpart); + let nplus = newpart.txtplus(oldpart); + if nminus != "[...]" || nplus != "[...]" { + let tup = (nminus, nplus); + *diffcount.entry(tup).or_insert(0) += 1; + } + } + if diffcount.is_empty() { + println!("No data will be moved between nodes."); + } else { + let mut diffcount = diffcount.into_iter().collect::>(); + diffcount.sort(); + println!("Number of partitions that move:"); + for ((nminus, nplus), npart) in diffcount { + println!("\t{}\t{} -> {}", npart, nminus, nplus); + } + } + println!(); + + // Calculate and save new assignation data + let (nodes, assignation_data) = + self.compute_assignation_data(&configured_nodes[..], &partitions[..]); + + self.node_id_vec = nodes; + self.ring_assignation_data = assignation_data; + + true + } + + fn initial_partition_assignation(&self) -> Option>> { + let (configured_nodes, zones) = self.configured_nodes_and_zones(); + let n_zones = zones.len(); + + // Create a vector of partition indices (0 to 2**PARTITION_BITS-1) + let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::>(); + + // Prepare ring + let mut partitions: Vec = partitions_idx + .iter() + .map(|_i| PartitionAss::new()) + .collect::>(); + + // Create MagLev priority queues for each node + let mut queues = configured_nodes + .iter() + .filter(|(_id, info)| info.capacity.is_some()) + .map(|(node_id, node_info)| { + let mut parts = partitions_idx + .iter() + .map(|i| { + let part_data = + [&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat(); + (*i, fasthash(&part_data[..])) + }) + .collect::>(); + parts.sort_by_key(|(_i, h)| *h); + let parts_i = parts.iter().map(|(i, _h)| *i).collect::>(); + (node_id, node_info, parts_i, 0) + }) + .collect::>(); + + let max_capacity = configured_nodes + .iter() + .filter_map(|(_, node_info)| node_info.capacity) + .fold(0, std::cmp::max); + + // Fill up ring + for rep in 0..self.replication_factor { + queues.sort_by_key(|(ni, _np, _q, _p)| { + let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat(); + fasthash(&queue_data[..]) + }); + + for (_, _, _, pos) in queues.iter_mut() { + *pos = 0; + } + + let mut remaining = partitions_idx.len(); + while remaining > 0 { + let remaining0 = remaining; + for i_round in 0..max_capacity { + for (node_id, node_info, q, pos) in queues.iter_mut() { + if i_round >= node_info.capacity.unwrap() { + continue; + } + for (pos2, &qv) in q.iter().enumerate().skip(*pos) { + if partitions[qv].add(rep + 1, n_zones, node_id, node_info) { + remaining -= 1; + *pos = pos2 + 1; + break; + } + } + } + } + if remaining == remaining0 { + // No progress made, exit + return None; + } + } + } + + Some(partitions) + } + + fn configured_nodes_and_zones(&self) -> (Vec<(&Uuid, &NodeRole)>, HashSet<&str>) { + let configured_nodes = self + .roles + .items() + .iter() + .filter(|(_id, _, info)| info.0.is_some()) + .map(|(id, _, info)| (id, info.0.as_ref().unwrap())) + .collect::>(); + + let zones = configured_nodes + .iter() + .filter(|(_id, info)| info.capacity.is_some()) + .map(|(_id, info)| info.zone.as_str()) + .collect::>(); + + (configured_nodes, zones) + } + + fn compute_assignation_data<'a>( + &self, + configured_nodes: &[(&'a Uuid, &'a NodeRole)], + partitions: &[PartitionAss<'a>], + ) -> (Vec, Vec) { + assert!(partitions.len() == (1 << PARTITION_BITS)); + + // Make a canonical order for nodes + let mut nodes = configured_nodes + .iter() + .filter(|(_id, info)| info.capacity.is_some()) + .map(|(id, _)| **id) + .collect::>(); + let nodes_rev = nodes + .iter() + .enumerate() + .map(|(i, id)| (*id, i as CompactNodeType)) + .collect::>(); + + let mut assignation_data = vec![]; + for partition in partitions.iter() { + assert!(partition.nodes.len() == self.replication_factor); + for (id, _) in partition.nodes.iter() { + assignation_data.push(*nodes_rev.get(id).unwrap()); + } + } + + nodes.extend( + configured_nodes + .iter() + .filter(|(_id, info)| info.capacity.is_none()) + .map(|(id, _)| **id), + ); + + (nodes, assignation_data) + } + + fn parse_assignation_data(&self) -> Vec> { + if self.ring_assignation_data.len() == self.replication_factor * (1 << PARTITION_BITS) { + // If the previous assignation data is correct, use that + let mut partitions = vec![]; + for i in 0..(1 << PARTITION_BITS) { + let mut part = PartitionAss::new(); + for node_i in self.ring_assignation_data + [i * self.replication_factor..(i + 1) * self.replication_factor] + .iter() + { + let node_id = &self.node_id_vec[*node_i as usize]; + + if let Some(NodeRoleV(Some(info))) = self.roles.get(node_id) { + part.nodes.push((node_id, Some(info))); + } else { + part.nodes.push((node_id, None)); + } + } + partitions.push(part); + } + partitions + } else { + // Otherwise start fresh + (0..(1 << PARTITION_BITS)) + .map(|_| PartitionAss::new()) + .collect() + } + } + + fn partitions_per_node<'a>(&self, partitions: &[PartitionAss<'a>]) -> HashMap<&'a Uuid, usize> { + let mut partitions_per_node = HashMap::<&Uuid, usize>::new(); + for p in partitions.iter() { + for (id, _) in p.nodes.iter() { + *partitions_per_node.entry(*id).or_insert(0) += 1; + } + } + partitions_per_node + } +} + +// ---- Internal structs for partition assignation in layout ---- + +#[derive(Clone)] +struct PartitionAss<'a> { + nodes: Vec<(&'a Uuid, Option<&'a NodeRole>)>, +} + +impl<'a> PartitionAss<'a> { + fn new() -> Self { + Self { nodes: Vec::new() } + } + + fn nplus(&self, other: &PartitionAss<'a>) -> usize { + self.nodes + .iter() + .filter(|x| !other.nodes.contains(x)) + .count() + } + + fn txtplus(&self, other: &PartitionAss<'a>) -> String { + let mut nodes = self + .nodes + .iter() + .filter(|x| !other.nodes.contains(x)) + .map(|x| format!("{:?}", x.0)) + .collect::>(); + nodes.sort(); + if self.nodes.iter().any(|x| other.nodes.contains(x)) { + nodes.push("...".into()); + } + format!("[{}]", nodes.join(" ")) + } + + fn is_valid_transition_to(&self, other: &PartitionAss<'a>, replication_factor: usize) -> bool { + let min_keep_nodes_per_part = (replication_factor + 1) / 2; + let n_removed = self.nplus(other); + + if self.nodes.len() <= min_keep_nodes_per_part { + n_removed == 0 + } else { + n_removed <= self.nodes.len() - min_keep_nodes_per_part + } + } + + fn add( + &mut self, + target_len: usize, + n_zones: usize, + node: &'a Uuid, + role: &'a NodeRole, + ) -> bool { + if self.nodes.len() != target_len - 1 { + return false; + } + + let p_zns = self + .nodes + .iter() + .map(|(_id, info)| info.unwrap().zone.as_str()) + .collect::>(); + if (p_zns.len() < n_zones && !p_zns.contains(&role.zone.as_str())) + || (p_zns.len() == n_zones && !self.nodes.iter().any(|(id, _)| *id == node)) + { + self.nodes.push((node, Some(role))); + true + } else { + false + } + } +} diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index ea3f1139..b72392ab 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -5,6 +5,7 @@ extern crate log; mod consul; +pub mod layout; pub mod ring; pub mod system; diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs index 3cb0d233..73a126a2 100644 --- a/src/rpc/ring.rs +++ b/src/rpc/ring.rs @@ -1,12 +1,11 @@ //! Module containing types related to computing nodes which should receive a copy of data blocks //! and metadata -use std::collections::{HashMap, HashSet}; use std::convert::TryInto; -use serde::{Deserialize, Serialize}; - use garage_util::data::*; +use crate::layout::ClusterLayout; + /// A partition id, which is stored on 16 bits /// i.e. we have up to 2**16 partitions. /// (in practice we have exactly 2**PARTITION_BITS partitions) @@ -22,47 +21,6 @@ pub const PARTITION_BITS: usize = 8; const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS); -/// The user-defined configuration of the cluster's nodes -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct NetworkConfig { - /// Map of each node's id to it's configuration - pub members: HashMap, - /// Version of this config - pub version: u64, -} - -impl NetworkConfig { - pub(crate) fn new() -> Self { - Self { - members: HashMap::new(), - version: 0, - } - } -} - -/// The overall configuration of one (possibly remote) node -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct NetworkConfigEntry { - /// Datacenter at which this entry belong. This infromation might be used to perform a better - /// geodistribution - pub zone: String, - /// The (relative) capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option, - /// A tag to recognize the entry, not used for other things than display - pub tag: String, -} - -impl NetworkConfigEntry { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => format!("{}", c), - None => "gateway".to_string(), - } - } -} - /// A ring distributing fairly objects to nodes #[derive(Clone)] pub struct Ring { @@ -70,7 +28,7 @@ pub struct Ring { pub replication_factor: usize, /// The network configuration used to generate this ring - pub config: NetworkConfig, + pub layout: ClusterLayout, // Internal order of nodes used to make a more compact representation of the ring nodes: Vec, @@ -81,7 +39,7 @@ pub struct Ring { // Type to store compactly the id of a node in the system // Change this to u16 the day we want to have more than 256 nodes in a cluster -type CompactNodeType = u8; +pub type CompactNodeType = u8; // The maximum number of times an object might get replicated // This must be at least 3 because Garage supports 3-way replication @@ -102,132 +60,26 @@ struct RingEntry { } impl Ring { - // TODO this function MUST be refactored, it's 100 lines long, with a 50 lines loop, going up to 6 - // levels of imbrication. It is basically impossible to test, maintain, or understand for an - // outsider. - pub(crate) fn new(config: NetworkConfig, replication_factor: usize) -> Self { - // Create a vector of partition indices (0 to 2**PARTITION_BITS-1) - let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::>(); - - let zones = config - .members - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(_id, info)| info.zone.as_str()) - .collect::>(); - let n_zones = zones.len(); - - // Prepare ring - let mut partitions: Vec> = partitions_idx - .iter() - .map(|_i| Vec::new()) - .collect::>(); - - // Create MagLev priority queues for each node - let mut queues = config - .members - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(node_id, node_info)| { - let mut parts = partitions_idx - .iter() - .map(|i| { - let part_data = - [&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat(); - (*i, fasthash(&part_data[..])) - }) - .collect::>(); - parts.sort_by_key(|(_i, h)| *h); - let parts_i = parts.iter().map(|(i, _h)| *i).collect::>(); - (node_id, node_info, parts_i, 0) - }) - .collect::>(); - - let max_capacity = config - .members - .iter() - .filter_map(|(_, node_info)| node_info.capacity) - .fold(0, std::cmp::max); - - assert!(replication_factor <= MAX_REPLICATION); - - // Fill up ring - for rep in 0..replication_factor { - queues.sort_by_key(|(ni, _np, _q, _p)| { - let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat(); - fasthash(&queue_data[..]) - }); - - for (_, _, _, pos) in queues.iter_mut() { - *pos = 0; - } - - let mut remaining = partitions_idx.len(); - while remaining > 0 { - let remaining0 = remaining; - for i_round in 0..max_capacity { - for (node_id, node_info, q, pos) in queues.iter_mut() { - if i_round >= node_info.capacity.unwrap() { - continue; - } - for (pos2, &qv) in q.iter().enumerate().skip(*pos) { - if partitions[qv].len() != rep { - continue; - } - let p_zns = partitions[qv] - .iter() - .map(|(_id, info)| info.zone.as_str()) - .collect::>(); - if (p_zns.len() < n_zones && !p_zns.contains(&node_info.zone.as_str())) - || (p_zns.len() == n_zones - && !partitions[qv].iter().any(|(id, _i)| id == node_id)) - { - partitions[qv].push((node_id, node_info)); - remaining -= 1; - *pos = pos2 + 1; - break; - } - } - } - } - if remaining == remaining0 { - // No progress made, exit - warn!("Could not build ring, not enough nodes configured."); - return Self { - replication_factor, - config, - nodes: vec![], - ring: vec![], - }; - } - } + pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self { + if replication_factor != layout.replication_factor { + warn!("Could not build ring: replication factor does not match between local configuration and network role assignation."); + return Self::empty(layout, replication_factor); } - // Make a canonical order for nodes - let nodes = config - .members - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(id, _)| *id) - .collect::>(); - let nodes_rev = nodes - .iter() - .enumerate() - .map(|(i, id)| (*id, i as CompactNodeType)) - .collect::>(); + if layout.ring_assignation_data.len() != replication_factor * (1 << PARTITION_BITS) { + warn!("Could not build ring: network role assignation data has invalid length"); + return Self::empty(layout, replication_factor); + } - let ring = partitions - .iter() - .enumerate() - .map(|(i, nodes)| { + let nodes = layout.node_id_vec.clone(); + let ring = (0..(1 << PARTITION_BITS)) + .map(|i| { let top = (i as u16) << (16 - PARTITION_BITS); - let nodes = nodes - .iter() - .map(|(id, _info)| *nodes_rev.get(id).unwrap()) - .collect::>(); - assert!(nodes.len() == replication_factor); let mut nodes_buf = [0u8; MAX_REPLICATION]; - nodes_buf[..replication_factor].copy_from_slice(&nodes[..]); + nodes_buf[..replication_factor].copy_from_slice( + &layout.ring_assignation_data + [replication_factor * i..replication_factor * (i + 1)], + ); RingEntry { hash_prefix: top, nodes_buf, @@ -237,12 +89,21 @@ impl Ring { Self { replication_factor, - config, + layout, nodes, ring, } } + fn empty(layout: ClusterLayout, replication_factor: usize) -> Self { + Self { + replication_factor, + layout, + nodes: vec![], + ring: vec![], + } + } + /// Get the partition in which data would fall on pub fn partition_of(&self, position: &Hash) -> Partition { let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index df0e94f8..68bdfc4f 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -225,7 +225,7 @@ impl RpcHelper { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); let ring: Arc = self.0.ring.borrow().clone(); - let our_zone = match ring.config.members.get(&self.0.our_node_id) { + let our_zone = match ring.layout.node_role(&self.0.our_node_id) { Some(pc) => &pc.zone, None => "", }; @@ -238,7 +238,7 @@ impl RpcHelper { // and within a same zone we priorize nodes with the lowest latency. let mut requests = requests .map(|(to, fut)| { - let peer_zone = match ring.config.members.get(&to) { + let peer_zone = match ring.layout.node_role(&to) { Some(pc) => &pc.zone, None => "", }; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 3f5f7fb1..aa8947ea 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -23,12 +23,13 @@ use netapp::{NetApp, NetworkKey, NodeID, NodeKey}; use garage_util::background::BackgroundRunner; use garage_util::config::Config; -use garage_util::data::Uuid; +use garage_util::data::*; use garage_util::error::*; use garage_util::persister::Persister; use garage_util::time::*; use crate::consul::*; +use crate::layout::*; use crate::ring::*; use crate::rpc_helper::*; @@ -48,13 +49,13 @@ pub enum SystemRpc { Ok, /// Request to connect to a specific node (in @: format) Connect(String), - /// Ask other node its config. Answered with AdvertiseConfig - PullConfig, + /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout + PullClusterLayout, /// Advertise Garage status. Answered with another AdvertiseStatus. /// Exchanged with every node on a regular basis. AdvertiseStatus(NodeStatus), - /// Advertisement of nodes config. Sent spontanously or in response to PullConfig - AdvertiseConfig(NetworkConfig), + /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout + AdvertiseClusterLayout(ClusterLayout), /// Get known nodes states GetKnownNodes, /// Return known nodes @@ -70,7 +71,7 @@ pub struct System { /// The id of this node pub id: Uuid, - persist_config: Persister, + persist_cluster_layout: Persister, persist_peer_list: Persister>, local_status: ArcSwap, @@ -103,8 +104,10 @@ pub struct NodeStatus { pub hostname: String, /// Replication factor configured on the node pub replication_factor: usize, - /// Configuration version - pub config_version: u64, + /// Cluster layout version + pub cluster_layout_version: u64, + /// Hash of cluster layout staging data + pub cluster_layout_staging_hash: Hash, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -187,17 +190,17 @@ impl System { gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); info!("Node public key: {}", hex::encode(&node_key.public_key())); - let persist_config = Persister::new(&config.metadata_dir, "network_config"); + let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout"); let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); - let net_config = match persist_config.load() { + let cluster_layout = match persist_cluster_layout.load() { Ok(x) => x, Err(e) => { info!( - "No valid previous network configuration stored ({}), starting fresh.", + "No valid previous cluster layout stored ({}), starting fresh.", e ); - NetworkConfig::new() + ClusterLayout::new(replication_factor) } }; @@ -206,10 +209,11 @@ impl System { .into_string() .unwrap_or_else(|_| "".to_string()), replication_factor, - config_version: net_config.version, + cluster_layout_version: cluster_layout.version, + cluster_layout_staging_hash: cluster_layout.staging_hash, }; - let ring = Ring::new(net_config, replication_factor); + let ring = Ring::new(cluster_layout, replication_factor); let (update_ring, ring) = watch::channel(Arc::new(ring)); if let Some(addr) = config.rpc_public_addr { @@ -229,7 +233,7 @@ impl System { let sys = Arc::new(System { id: netapp.id.into(), - persist_config, + persist_cluster_layout, persist_peer_list, local_status: ArcSwap::new(Arc::new(local_status)), node_status: RwLock::new(HashMap::new()), @@ -292,12 +296,12 @@ impl System { } /// Save network configuration to disc - async fn save_network_config(self: Arc) -> Result<(), Error> { + async fn save_cluster_layout(self: Arc) -> Result<(), Error> { let ring: Arc = self.ring.borrow().clone(); - self.persist_config - .save_async(&ring.config) + self.persist_cluster_layout + .save_async(&ring.layout) .await - .expect("Cannot save current cluster configuration"); + .expect("Cannot save current cluster layout"); Ok(()) } @@ -305,7 +309,8 @@ impl System { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); let ring = self.ring.borrow(); - new_si.config_version = ring.config.version; + new_si.cluster_layout_version = ring.layout.version; + new_si.cluster_layout_staging_hash = ring.layout.staging_hash; self.local_status.swap(Arc::new(new_si)); } @@ -337,9 +342,9 @@ impl System { ))); } - fn handle_pull_config(&self) -> SystemRpc { + fn handle_pull_cluster_layout(&self) -> SystemRpc { let ring = self.ring.borrow().clone(); - SystemRpc::AdvertiseConfig(ring.config.clone()) + SystemRpc::AdvertiseClusterLayout(ring.layout.clone()) } fn handle_get_known_nodes(&self) -> SystemRpc { @@ -360,7 +365,8 @@ impl System { .unwrap_or(NodeStatus { hostname: "?".to_string(), replication_factor: 0, - config_version: 0, + cluster_layout_version: 0, + cluster_layout_staging_hash: Hash::from([0u8; 32]), }), }) .collect::>(); @@ -381,10 +387,12 @@ impl System { std::process::exit(1); } - if info.config_version > local_info.config_version { + if info.cluster_layout_version > local_info.cluster_layout_version + || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash + { let self2 = self.clone(); self.background.spawn_cancellable(async move { - self2.pull_config(from).await; + self2.pull_cluster_layout(from).await; Ok(()) }); } @@ -397,32 +405,39 @@ impl System { Ok(SystemRpc::Ok) } - async fn handle_advertise_config( + async fn handle_advertise_cluster_layout( self: Arc, - adv: &NetworkConfig, + adv: &ClusterLayout, ) -> Result { let update_ring = self.update_ring.lock().await; - let ring: Arc = self.ring.borrow().clone(); + let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); - if adv.version > ring.config.version { - let ring = Ring::new(adv.clone(), self.replication_factor); + let prev_layout_check = layout.check(); + if layout.merge(adv) { + if prev_layout_check && !layout.check() { + error!("New cluster layout is invalid, discarding."); + return Err(Error::Message( + "New cluster layout is invalid, discarding.".into(), + )); + } + + let ring = Ring::new(layout.clone(), self.replication_factor); update_ring.send(Arc::new(ring))?; drop(update_ring); let self2 = self.clone(); - let adv = adv.clone(); self.background.spawn_cancellable(async move { self2 .rpc .broadcast( &self2.system_endpoint, - SystemRpc::AdvertiseConfig(adv), + SystemRpc::AdvertiseClusterLayout(layout), RequestStrategy::with_priority(PRIO_HIGH), ) .await; Ok(()) }); - self.background.spawn(self.clone().save_network_config()); + self.background.spawn(self.clone().save_cluster_layout()); } Ok(SystemRpc::Ok) @@ -456,14 +471,15 @@ impl System { }; while !*stop_signal.borrow() { - let not_configured = self.ring.borrow().config.members.is_empty(); + let not_configured = !self.ring.borrow().layout.check(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; + let expected_n_nodes = self.ring.borrow().layout.num_nodes(); let bad_peers = self .fullmesh .get_peer_list() .iter() .filter(|p| p.is_up()) - .count() != self.ring.borrow().config.members.len(); + .count() != expected_n_nodes; if not_configured || no_peers || bad_peers { info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers); @@ -533,18 +549,18 @@ impl System { self.persist_peer_list.save_async(&peer_list).await } - async fn pull_config(self: Arc, peer: Uuid) { + async fn pull_cluster_layout(self: Arc, peer: Uuid) { let resp = self .rpc .call( &self.system_endpoint, peer, - SystemRpc::PullConfig, + SystemRpc::PullClusterLayout, RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), ) .await; - if let Ok(SystemRpc::AdvertiseConfig(config)) = resp { - let _: Result<_, _> = self.handle_advertise_config(&config).await; + if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { + let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; } } } @@ -554,9 +570,11 @@ impl EndpointHandler for System { async fn handle(self: &Arc, msg: &SystemRpc, from: NodeID) -> Result { match msg { SystemRpc::Connect(node) => self.handle_connect(node).await, - SystemRpc::PullConfig => Ok(self.handle_pull_config()), + SystemRpc::PullClusterLayout => Ok(self.handle_pull_cluster_layout()), SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await, - SystemRpc::AdvertiseConfig(adv) => self.clone().handle_advertise_config(adv).await, + SystemRpc::AdvertiseClusterLayout(adv) => { + self.clone().handle_advertise_cluster_layout(adv).await + } SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), _ => Err(Error::BadRpc("Unexpected RPC message".to_string())), } diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index 616bf275..dc37f12c 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage_table" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" description = "Table sharding and replication engine (DynamoDB-like) for the Garage object store" repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [lib] path = "lib.rs" @@ -13,8 +14,8 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_rpc = { version = "0.4.0", path = "../rpc" } -garage_util = { version = "0.4.0", path = "../util" } +garage_rpc = { version = "0.5.0", path = "../rpc" } +garage_util = { version = "0.5.0", path = "../util" } async-trait = "0.1.7" bytes = "1.0" diff --git a/src/table/lib.rs b/src/table/lib.rs index 53d2c93b..d6c19f1b 100644 --- a/src/table/lib.rs +++ b/src/table/lib.rs @@ -4,7 +4,6 @@ #[macro_use] extern crate log; -pub mod crdt; pub mod schema; pub mod util; @@ -18,3 +17,7 @@ pub mod table; pub use schema::*; pub use table::*; pub use util::*; + +pub mod crdt { + pub use garage_util::crdt::*; +} diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 8f01fbdd..18682ace 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -28,10 +28,10 @@ impl TableReplication for TableFullReplication { fn write_nodes(&self, _hash: &Hash) -> Vec { let ring = self.system.ring.borrow(); - ring.config.members.keys().cloned().collect::>() + ring.layout.node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.ring.borrow().config.members.len(); + let nmembers = self.system.ring.borrow().layout.node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index f2a001fa..e33f8a66 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage_util" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" description = "Utility crate for the Garage object store" repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [lib] path = "lib.rs" diff --git a/src/table/crdt/bool.rs b/src/util/crdt/bool.rs similarity index 100% rename from src/table/crdt/bool.rs rename to src/util/crdt/bool.rs diff --git a/src/table/crdt/crdt.rs b/src/util/crdt/crdt.rs similarity index 98% rename from src/table/crdt/crdt.rs rename to src/util/crdt/crdt.rs index a8f1b9aa..9b5f230d 100644 --- a/src/table/crdt/crdt.rs +++ b/src/util/crdt/crdt.rs @@ -1,4 +1,4 @@ -use garage_util::data::*; +use crate::data::*; /// Definition of a CRDT - all CRDT Rust types implement this. /// diff --git a/src/table/crdt/lww.rs b/src/util/crdt/lww.rs similarity index 94% rename from src/table/crdt/lww.rs rename to src/util/crdt/lww.rs index be197d88..43d13f27 100644 --- a/src/table/crdt/lww.rs +++ b/src/util/crdt/lww.rs @@ -1,6 +1,8 @@ +use std::cmp::Ordering; + use serde::{Deserialize, Serialize}; -use garage_util::time::now_msec; +use crate::time::now_msec; use crate::crdt::crdt::*; @@ -104,11 +106,15 @@ where T: Clone + Crdt, { fn merge(&mut self, other: &Self) { - if other.ts > self.ts { - self.ts = other.ts; - self.v = other.v.clone(); - } else if other.ts == self.ts { - self.v.merge(&other.v); + match other.ts.cmp(&self.ts) { + Ordering::Greater => { + self.ts = other.ts; + self.v = other.v.clone(); + } + Ordering::Equal => { + self.v.merge(&other.v); + } + Ordering::Less => (), } } } diff --git a/src/table/crdt/lww_map.rs b/src/util/crdt/lww_map.rs similarity index 95% rename from src/table/crdt/lww_map.rs rename to src/util/crdt/lww_map.rs index fb25fd46..3e9aba79 100644 --- a/src/table/crdt/lww_map.rs +++ b/src/util/crdt/lww_map.rs @@ -1,6 +1,8 @@ +use std::cmp::Ordering; + use serde::{Deserialize, Serialize}; -use garage_util::time::now_msec; +use crate::time::now_msec; use crate::crdt::crdt::*; @@ -135,11 +137,15 @@ where match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) { Ok(i) => { let (_, ts1, _v1) = &self.vals[i]; - if ts2 > ts1 { - self.vals[i].1 = *ts2; - self.vals[i].2 = v2.clone(); - } else if ts1 == ts2 { - self.vals[i].2.merge(v2); + match ts2.cmp(ts1) { + Ordering::Greater => { + self.vals[i].1 = *ts2; + self.vals[i].2 = v2.clone(); + } + Ordering::Equal => { + self.vals[i].2.merge(v2); + } + Ordering::Less => (), } } Err(i) => { diff --git a/src/table/crdt/map.rs b/src/util/crdt/map.rs similarity index 100% rename from src/table/crdt/map.rs rename to src/util/crdt/map.rs diff --git a/src/table/crdt/mod.rs b/src/util/crdt/mod.rs similarity index 100% rename from src/table/crdt/mod.rs rename to src/util/crdt/mod.rs diff --git a/src/util/lib.rs b/src/util/lib.rs index 478b9ea4..64874095 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -5,6 +5,7 @@ extern crate log; pub mod background; pub mod config; +pub mod crdt; pub mod data; pub mod error; pub mod persister; diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 634ce282..72701c90 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage_web" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat ", "Quentin Dufour "] edition = "2018" license = "AGPL-3.0" description = "S3-like website endpoint crate for the Garage object store" repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [lib] path = "lib.rs" @@ -13,10 +14,10 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_api = { version = "0.4.0", path = "../api" } -garage_model = { version = "0.4.0", path = "../model" } -garage_util = { version = "0.4.0", path = "../util" } -garage_table = { version = "0.4.0", path = "../table" } +garage_api = { version = "0.5.0", path = "../api" } +garage_model = { version = "0.5.0", path = "../model" } +garage_util = { version = "0.5.0", path = "../util" } +garage_table = { version = "0.5.0", path = "../table" } err-derive = "0.3" log = "0.4"