forked from Deuxfleurs/garage
Merge pull request 'Garage v1.0' (#683) from next-0.10 into main
Reviewed-on: Deuxfleurs/garage#683
This commit is contained in:
commit
1779fd40c0
124 changed files with 7724 additions and 3830 deletions
|
@ -5,6 +5,7 @@ when:
|
|||
- pull_request
|
||||
- deployment
|
||||
- cron
|
||||
- manual
|
||||
|
||||
steps:
|
||||
- name: check formatting
|
||||
|
@ -33,8 +34,6 @@ steps:
|
|||
- ./result/bin/garage_util-*
|
||||
- ./result/bin/garage_web-*
|
||||
- ./result/bin/garage-*
|
||||
- GARAGE_TEST_INTEGRATION_DB_ENGINE=sled ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false)
|
||||
- nix-shell --attr ci --run "killall -9 garage" || true
|
||||
- GARAGE_TEST_INTEGRATION_DB_ENGINE=lmdb ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false)
|
||||
- nix-shell --attr ci --run "killall -9 garage" || true
|
||||
- GARAGE_TEST_INTEGRATION_DB_ENGINE=sqlite ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false)
|
||||
|
|
179
Cargo.lock
generated
179
Cargo.lock
generated
|
@ -17,6 +17,41 @@ version = "1.0.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "aead"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"
|
||||
dependencies = [
|
||||
"crypto-common",
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aes"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cipher",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aes-gcm"
|
||||
version = "0.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"
|
||||
dependencies = [
|
||||
"aead",
|
||||
"aes",
|
||||
"cipher",
|
||||
"ctr",
|
||||
"ghash",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.7"
|
||||
|
@ -761,6 +796,16 @@ dependencies = [
|
|||
"windows-targets 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cipher"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
|
||||
dependencies = [
|
||||
"crypto-common",
|
||||
"inout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.34.0"
|
||||
|
@ -860,9 +905,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.3.2"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
|
||||
checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
@ -876,15 +921,6 @@ dependencies = [
|
|||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-queue"
|
||||
version = "0.3.11"
|
||||
|
@ -929,9 +965,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
"rand_core",
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctr"
|
||||
version = "0.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"
|
||||
dependencies = [
|
||||
"cipher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.20.5"
|
||||
|
@ -1167,16 +1213,6 @@ dependencies = [
|
|||
name = "format_table"
|
||||
version = "0.1.1"
|
||||
|
||||
[[package]]
|
||||
name = "fs2"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.30"
|
||||
|
@ -1266,18 +1302,9 @@ dependencies = [
|
|||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "garage"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"assert-json-diff",
|
||||
"async-trait",
|
||||
|
@ -1319,6 +1346,7 @@ dependencies = [
|
|||
"serde",
|
||||
"serde_bytes",
|
||||
"serde_json",
|
||||
"sha1",
|
||||
"sha2",
|
||||
"static_init",
|
||||
"structopt",
|
||||
|
@ -1332,13 +1360,17 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "garage_api"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"aes-gcm",
|
||||
"argon2",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"base64 0.21.7",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"crc32c",
|
||||
"crc32fast",
|
||||
"crypto-common",
|
||||
"err-derive",
|
||||
"form_urlencoded",
|
||||
|
@ -1372,16 +1404,18 @@ dependencies = [
|
|||
"serde",
|
||||
"serde_bytes",
|
||||
"serde_json",
|
||||
"sha1",
|
||||
"sha2",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util 0.7.10",
|
||||
"tracing",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "garage_block"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
|
@ -1408,7 +1442,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "garage_db"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"err-derive",
|
||||
"heed",
|
||||
|
@ -1417,13 +1451,12 @@ dependencies = [
|
|||
"r2d2",
|
||||
"r2d2_sqlite",
|
||||
"rusqlite",
|
||||
"sled",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "garage_model"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"async-trait",
|
||||
|
@ -1440,6 +1473,7 @@ dependencies = [
|
|||
"garage_table",
|
||||
"garage_util",
|
||||
"hex",
|
||||
"http 1.0.0",
|
||||
"opentelemetry",
|
||||
"parse_duration",
|
||||
"rand",
|
||||
|
@ -1452,7 +1486,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "garage_net"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"async-trait",
|
||||
|
@ -1478,7 +1512,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "garage_rpc"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"async-trait",
|
||||
|
@ -1513,7 +1547,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "garage_table"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"async-trait",
|
||||
|
@ -1535,7 +1569,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "garage_util"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"async-trait",
|
||||
|
@ -1569,7 +1603,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "garage_web"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"err-derive",
|
||||
"futures",
|
||||
|
@ -1618,6 +1652,16 @@ dependencies = [
|
|||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ghash"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"
|
||||
dependencies = [
|
||||
"opaque-debug",
|
||||
"polyval",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gimli"
|
||||
version = "0.28.1"
|
||||
|
@ -2067,6 +2111,15 @@ dependencies = [
|
|||
"hashbrown 0.14.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inout"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "instant"
|
||||
version = "0.1.12"
|
||||
|
@ -2647,6 +2700,12 @@ version = "1.19.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||
|
||||
[[package]]
|
||||
name = "opaque-debug"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.1.5"
|
||||
|
@ -2984,6 +3043,18 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "polyval"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"opaque-debug",
|
||||
"universal-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
|
@ -3770,22 +3841,6 @@ dependencies = [
|
|||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sled"
|
||||
version = "0.34.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
"fs2",
|
||||
"fxhash",
|
||||
"libc",
|
||||
"log",
|
||||
"parking_lot 0.11.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.13.1"
|
||||
|
@ -4445,6 +4500,16 @@ version = "0.2.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
|
||||
|
||||
[[package]]
|
||||
name = "universal-hash"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"
|
||||
dependencies = [
|
||||
"crypto-common",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unsafe-libyaml"
|
||||
version = "0.2.10"
|
||||
|
|
347
Cargo.nix
347
Cargo.nix
|
@ -34,7 +34,7 @@ args@{
|
|||
ignoreLockHash,
|
||||
}:
|
||||
let
|
||||
nixifiedLockHash = "9ea4045dd09421583b69811f95797af9c1f16239ecce89d8a6f5a9319d7d8526";
|
||||
nixifiedLockHash = "1ccd5eb25a83962821e0e9da4ce6df31717b2b97a5b3a0c80c9e0e0759710143";
|
||||
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
|
||||
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
|
||||
lockHashIgnored = if ignoreLockHash
|
||||
|
@ -58,17 +58,17 @@ in
|
|||
{
|
||||
cargo2nixVersion = "0.11.0";
|
||||
workspace = {
|
||||
garage_db = rustPackages.unknown.garage_db."0.9.4";
|
||||
garage_util = rustPackages.unknown.garage_util."0.9.4";
|
||||
garage_net = rustPackages.unknown.garage_net."0.9.4";
|
||||
garage_rpc = rustPackages.unknown.garage_rpc."0.9.4";
|
||||
garage_db = rustPackages.unknown.garage_db."1.0.0";
|
||||
garage_util = rustPackages.unknown.garage_util."1.0.0";
|
||||
garage_net = rustPackages.unknown.garage_net."1.0.0";
|
||||
garage_rpc = rustPackages.unknown.garage_rpc."1.0.0";
|
||||
format_table = rustPackages.unknown.format_table."0.1.1";
|
||||
garage_table = rustPackages.unknown.garage_table."0.9.4";
|
||||
garage_block = rustPackages.unknown.garage_block."0.9.4";
|
||||
garage_model = rustPackages.unknown.garage_model."0.9.4";
|
||||
garage_api = rustPackages.unknown.garage_api."0.9.4";
|
||||
garage_web = rustPackages.unknown.garage_web."0.9.4";
|
||||
garage = rustPackages.unknown.garage."0.9.4";
|
||||
garage_table = rustPackages.unknown.garage_table."1.0.0";
|
||||
garage_block = rustPackages.unknown.garage_block."1.0.0";
|
||||
garage_model = rustPackages.unknown.garage_model."1.0.0";
|
||||
garage_api = rustPackages.unknown.garage_api."1.0.0";
|
||||
garage_web = rustPackages.unknown.garage_web."1.0.0";
|
||||
garage = rustPackages.unknown.garage."1.0.0";
|
||||
k2v-client = rustPackages.unknown.k2v-client."0.0.4";
|
||||
};
|
||||
"registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec {
|
||||
|
@ -88,6 +88,58 @@ in
|
|||
src = fetchCratesIo { inherit name version; sha256 = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"; };
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".aead."0.5.2" = overridableMkRustCrate (profileName: rec {
|
||||
name = "aead";
|
||||
version = "0.5.2";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"; };
|
||||
features = builtins.concatLists [
|
||||
[ "alloc" ]
|
||||
[ "getrandom" ]
|
||||
[ "rand_core" ]
|
||||
[ "stream" ]
|
||||
];
|
||||
dependencies = {
|
||||
crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out;
|
||||
generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".aes."0.8.4" = overridableMkRustCrate (profileName: rec {
|
||||
name = "aes";
|
||||
version = "0.8.4";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"; };
|
||||
dependencies = {
|
||||
cfg_if = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cfg-if."1.0.0" { inherit profileName; }).out;
|
||||
cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out;
|
||||
${ if hostPlatform.parsed.cpu.name == "aarch64" || hostPlatform.parsed.cpu.name == "x86_64" || hostPlatform.parsed.cpu.name == "i686" then "cpufeatures" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cpufeatures."0.2.12" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".aes-gcm."0.10.3" = overridableMkRustCrate (profileName: rec {
|
||||
name = "aes-gcm";
|
||||
version = "0.10.3";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"; };
|
||||
features = builtins.concatLists [
|
||||
[ "aes" ]
|
||||
[ "alloc" ]
|
||||
[ "default" ]
|
||||
[ "getrandom" ]
|
||||
[ "rand_core" ]
|
||||
[ "stream" ]
|
||||
];
|
||||
dependencies = {
|
||||
aead = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aead."0.5.2" { inherit profileName; }).out;
|
||||
aes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aes."0.8.4" { inherit profileName; }).out;
|
||||
cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out;
|
||||
ctr = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".ctr."0.9.2" { inherit profileName; }).out;
|
||||
ghash = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".ghash."0.5.1" { inherit profileName; }).out;
|
||||
subtle = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".subtle."2.5.0" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".ahash."0.8.7" = overridableMkRustCrate (profileName: rec {
|
||||
name = "ahash";
|
||||
version = "0.8.7";
|
||||
|
@ -622,7 +674,7 @@ in
|
|||
aws_smithy_types = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aws-smithy-types."1.1.4" { inherit profileName; }).out;
|
||||
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out;
|
||||
crc32c = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32c."0.6.4" { inherit profileName; }).out;
|
||||
crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" { inherit profileName; }).out;
|
||||
crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.11" { inherit profileName; }).out;
|
||||
http_body = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http-body."0.4.6" { inherit profileName; }).out;
|
||||
|
@ -642,7 +694,7 @@ in
|
|||
dependencies = {
|
||||
aws_smithy_types = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aws-smithy-types."1.1.4" { inherit profileName; }).out;
|
||||
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out;
|
||||
crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" { inherit profileName; }).out;
|
||||
crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
|
@ -1085,6 +1137,17 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" = overridableMkRustCrate (profileName: rec {
|
||||
name = "cipher";
|
||||
version = "0.4.4";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"; };
|
||||
dependencies = {
|
||||
crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out;
|
||||
inout = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".inout."0.1.3" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".clap."2.34.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "clap";
|
||||
version = "2.34.0";
|
||||
|
@ -1224,11 +1287,11 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" = overridableMkRustCrate (profileName: rec {
|
||||
"registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "crc32fast";
|
||||
version = "1.3.2";
|
||||
version = "1.4.0";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"; };
|
||||
src = fetchCratesIo { inherit name version; sha256 = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa"; };
|
||||
features = builtins.concatLists [
|
||||
[ "default" ]
|
||||
[ "std" ]
|
||||
|
@ -1252,21 +1315,6 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".crossbeam-epoch."0.9.18" = overridableMkRustCrate (profileName: rec {
|
||||
name = "crossbeam-epoch";
|
||||
version = "0.9.18";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"; };
|
||||
features = builtins.concatLists [
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "alloc")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "std")
|
||||
];
|
||||
dependencies = {
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_utils" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-utils."0.8.19" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".crossbeam-queue."0.3.11" = overridableMkRustCrate (profileName: rec {
|
||||
name = "crossbeam-queue";
|
||||
version = "0.3.11";
|
||||
|
@ -1288,7 +1336,6 @@ in
|
|||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"; };
|
||||
features = builtins.concatLists [
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default")
|
||||
[ "std" ]
|
||||
];
|
||||
});
|
||||
|
@ -1333,14 +1380,27 @@ in
|
|||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"; };
|
||||
features = builtins.concatLists [
|
||||
[ "getrandom" ]
|
||||
[ "rand_core" ]
|
||||
[ "std" ]
|
||||
];
|
||||
dependencies = {
|
||||
generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out;
|
||||
rand_core = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand_core."0.6.4" { inherit profileName; }).out;
|
||||
typenum = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".typenum."1.17.0" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".ctr."0.9.2" = overridableMkRustCrate (profileName: rec {
|
||||
name = "ctr";
|
||||
version = "0.9.2";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"; };
|
||||
dependencies = {
|
||||
cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".darling."0.20.5" = overridableMkRustCrate (profileName: rec {
|
||||
name = "darling";
|
||||
version = "0.20.5";
|
||||
|
@ -1699,17 +1759,6 @@ in
|
|||
src = fetchCrateLocal (workspaceSrc + "/src/format-table");
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".fs2."0.4.3" = overridableMkRustCrate (profileName: rec {
|
||||
name = "fs2";
|
||||
version = "0.4.3";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"; };
|
||||
dependencies = {
|
||||
${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && hostPlatform.isUnix then "libc" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.153" { inherit profileName; }).out;
|
||||
${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && hostPlatform.isWindows then "winapi" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".winapi."0.3.9" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" = overridableMkRustCrate (profileName: rec {
|
||||
name = "futures";
|
||||
version = "0.3.30";
|
||||
|
@ -1861,19 +1910,9 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".fxhash."0.2.1" = overridableMkRustCrate (profileName: rec {
|
||||
name = "fxhash";
|
||||
version = "0.2.1";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"; };
|
||||
dependencies = {
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "byteorder" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".byteorder."1.5.0" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"unknown".garage."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/garage");
|
||||
features = builtins.concatLists [
|
||||
|
@ -1887,7 +1926,6 @@ in
|
|||
(lib.optional (rootFeatures' ? "garage/opentelemetry-otlp" || rootFeatures' ? "garage/telemetry-otlp") "opentelemetry-otlp")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/opentelemetry-prometheus") "opentelemetry-prometheus")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/prometheus") "prometheus")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled") "sled")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite") "sqlite")
|
||||
(lib.optional (rootFeatures' ? "garage/syslog") "syslog")
|
||||
(lib.optional (rootFeatures' ? "garage/syslog" || rootFeatures' ? "garage/syslog-tracing") "syslog-tracing")
|
||||
|
@ -1902,15 +1940,15 @@ in
|
|||
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
|
||||
garage_api = (rustPackages."unknown".garage_api."0.9.4" { inherit profileName; }).out;
|
||||
garage_block = (rustPackages."unknown".garage_block."0.9.4" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."0.9.4" { inherit profileName; }).out;
|
||||
garage_model = (rustPackages."unknown".garage_model."0.9.4" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."0.9.4" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.4" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."0.9.4" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."0.9.4" { inherit profileName; }).out;
|
||||
garage_web = (rustPackages."unknown".garage_web."0.9.4" { inherit profileName; }).out;
|
||||
garage_api = (rustPackages."unknown".garage_api."1.0.0" { inherit profileName; }).out;
|
||||
garage_block = (rustPackages."unknown".garage_block."1.0.0" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out;
|
||||
garage_model = (rustPackages."unknown".garage_model."1.0.0" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out;
|
||||
garage_web = (rustPackages."unknown".garage_web."1.0.0" { inherit profileName; }).out;
|
||||
git_version = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".git-version."0.3.9" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
sodiumoxide = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".kuska-sodiumoxide."0.2.5-0" { inherit profileName; }).out;
|
||||
|
@ -1922,6 +1960,7 @@ in
|
|||
rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
|
||||
serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out;
|
||||
serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out;
|
||||
sha1 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha1."0.10.6" { inherit profileName; }).out;
|
||||
structopt = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".structopt."0.3.26" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/syslog" || rootFeatures' ? "garage/syslog-tracing" then "syslog_tracing" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".syslog-tracing."0.3.0" { inherit profileName; }).out;
|
||||
timeago = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".timeago."0.4.2" { inherit profileName; }).out;
|
||||
|
@ -1949,9 +1988,9 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_api."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_api."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_api";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/api");
|
||||
features = builtins.concatLists [
|
||||
|
@ -1961,22 +2000,26 @@ in
|
|||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage_api/metrics" || rootFeatures' ? "garage_api/prometheus") "prometheus")
|
||||
];
|
||||
dependencies = {
|
||||
aes_gcm = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aes-gcm."0.10.3" { inherit profileName; }).out;
|
||||
argon2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".argon2."0.5.3" { inherit profileName; }).out;
|
||||
async_compression = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".async-compression."0.4.6" { inherit profileName; }).out;
|
||||
async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.77" { profileName = "__noProfile"; }).out;
|
||||
base64 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".base64."0.21.7" { inherit profileName; }).out;
|
||||
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out;
|
||||
chrono = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".chrono."0.4.33" { inherit profileName; }).out;
|
||||
crc32c = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32c."0.6.4" { inherit profileName; }).out;
|
||||
crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" { inherit profileName; }).out;
|
||||
crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out;
|
||||
err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
|
||||
form_urlencoded = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".form_urlencoded."1.2.1" { inherit profileName; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
|
||||
garage_block = (rustPackages."unknown".garage_block."0.9.4" { inherit profileName; }).out;
|
||||
garage_model = (rustPackages."unknown".garage_model."0.9.4" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."0.9.4" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.4" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."0.9.4" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."0.9.4" { inherit profileName; }).out;
|
||||
garage_block = (rustPackages."unknown".garage_block."1.0.0" { inherit profileName; }).out;
|
||||
garage_model = (rustPackages."unknown".garage_model."1.0.0" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
hmac = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hmac."0.12.1" { inherit profileName; }).out;
|
||||
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
|
||||
|
@ -1999,17 +2042,19 @@ in
|
|||
serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out;
|
||||
serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out;
|
||||
serde_json = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_json."1.0.113" { inherit profileName; }).out;
|
||||
sha1 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha1."0.10.6" { inherit profileName; }).out;
|
||||
sha2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha2."0.10.8" { inherit profileName; }).out;
|
||||
tokio = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.36.0" { inherit profileName; }).out;
|
||||
tokio_stream = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-stream."0.1.14" { inherit profileName; }).out;
|
||||
tokio_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-util."0.7.10" { inherit profileName; }).out;
|
||||
tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out;
|
||||
url = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".url."2.5.0" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"unknown".garage_block."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_block."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_block";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/block");
|
||||
features = builtins.concatLists [
|
||||
|
@ -2023,11 +2068,11 @@ in
|
|||
bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."0.9.4" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."0.9.4" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.4" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."0.9.4" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."0.9.4" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
|
||||
rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
|
||||
|
@ -2040,9 +2085,9 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_db."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_db."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_db";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/db");
|
||||
features = builtins.concatLists [
|
||||
|
@ -2053,7 +2098,6 @@ in
|
|||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "r2d2")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2_sqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "r2d2_sqlite")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "rusqlite")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "sled")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite")
|
||||
];
|
||||
dependencies = {
|
||||
|
@ -2063,7 +2107,6 @@ in
|
|||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "r2d2" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".r2d2."0.8.10" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2_sqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "r2d2_sqlite" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".r2d2_sqlite."0.24.0" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "rusqlite" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rusqlite."0.31.0" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "sled" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sled."0.34.7" { inherit profileName; }).out;
|
||||
tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out;
|
||||
};
|
||||
devDependencies = {
|
||||
|
@ -2071,16 +2114,15 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_model."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_model."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_model";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/model");
|
||||
features = builtins.concatLists [
|
||||
(lib.optional (rootFeatures' ? "garage_model/default") "default")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/k2v" || rootFeatures' ? "garage_api/k2v" || rootFeatures' ? "garage_model/k2v") "k2v")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/lmdb" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/lmdb") "lmdb")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "sled")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite")
|
||||
];
|
||||
dependencies = {
|
||||
|
@ -2092,13 +2134,14 @@ in
|
|||
err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
|
||||
garage_block = (rustPackages."unknown".garage_block."0.9.4" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."0.9.4" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."0.9.4" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.4" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."0.9.4" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."0.9.4" { inherit profileName; }).out;
|
||||
garage_block = (rustPackages."unknown".garage_block."1.0.0" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
|
||||
opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
|
||||
parse_duration = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parse_duration."2.1.1" { inherit profileName; }).out;
|
||||
rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
|
||||
|
@ -2110,9 +2153,9 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_net."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_net."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_net";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/net");
|
||||
features = builtins.concatLists [
|
||||
|
@ -2147,9 +2190,9 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_rpc."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_rpc."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_rpc";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/rpc");
|
||||
features = builtins.concatLists [
|
||||
|
@ -2171,9 +2214,9 @@ in
|
|||
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."0.9.4" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."0.9.4" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."0.9.4" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out;
|
||||
gethostname = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".gethostname."0.4.3" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
itertools = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".itertools."0.12.1" { inherit profileName; }).out;
|
||||
|
@ -2195,9 +2238,9 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_table."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_table."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_table";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/table");
|
||||
dependencies = {
|
||||
|
@ -2206,9 +2249,9 @@ in
|
|||
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."0.9.4" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.4" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."0.9.4" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out;
|
||||
garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out;
|
||||
opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
|
||||
|
@ -2220,9 +2263,9 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_util."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_util."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_util";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/util");
|
||||
features = builtins.concatLists [
|
||||
|
@ -2238,8 +2281,8 @@ in
|
|||
digest = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".digest."0.10.7" { inherit profileName; }).out;
|
||||
err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."0.9.4" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."0.9.4" { inherit profileName; }).out;
|
||||
garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out;
|
||||
garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out;
|
||||
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
|
||||
hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out;
|
||||
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
|
||||
|
@ -2264,18 +2307,18 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"unknown".garage_web."0.9.4" = overridableMkRustCrate (profileName: rec {
|
||||
"unknown".garage_web."1.0.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "garage_web";
|
||||
version = "0.9.4";
|
||||
version = "1.0.0";
|
||||
registry = "unknown";
|
||||
src = fetchCrateLocal (workspaceSrc + "/src/web");
|
||||
dependencies = {
|
||||
err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
|
||||
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
|
||||
garage_api = (rustPackages."unknown".garage_api."0.9.4" { inherit profileName; }).out;
|
||||
garage_model = (rustPackages."unknown".garage_model."0.9.4" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."0.9.4" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."0.9.4" { inherit profileName; }).out;
|
||||
garage_api = (rustPackages."unknown".garage_api."1.0.0" { inherit profileName; }).out;
|
||||
garage_model = (rustPackages."unknown".garage_model."1.0.0" { inherit profileName; }).out;
|
||||
garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out;
|
||||
garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out;
|
||||
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
|
||||
http_body_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http-body-util."0.1.0" { inherit profileName; }).out;
|
||||
hyper = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."1.1.0" { inherit profileName; }).out;
|
||||
|
@ -2329,6 +2372,17 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".ghash."0.5.1" = overridableMkRustCrate (profileName: rec {
|
||||
name = "ghash";
|
||||
version = "0.5.1";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"; };
|
||||
dependencies = {
|
||||
opaque_debug = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" { inherit profileName; }).out;
|
||||
polyval = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".polyval."0.6.2" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".gimli."0.28.1" = overridableMkRustCrate (profileName: rec {
|
||||
name = "gimli";
|
||||
version = "0.28.1";
|
||||
|
@ -2936,6 +2990,16 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".inout."0.1.3" = overridableMkRustCrate (profileName: rec {
|
||||
name = "inout";
|
||||
version = "0.1.3";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"; };
|
||||
dependencies = {
|
||||
generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".instant."0.1.12" = overridableMkRustCrate (profileName: rec {
|
||||
name = "instant";
|
||||
version = "0.1.12";
|
||||
|
@ -3785,6 +3849,13 @@ in
|
|||
];
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" = overridableMkRustCrate (profileName: rec {
|
||||
name = "opaque-debug";
|
||||
version = "0.3.1";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"; };
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".openssl-probe."0.1.5" = overridableMkRustCrate (profileName: rec {
|
||||
name = "openssl-probe";
|
||||
version = "0.1.5";
|
||||
|
@ -4244,6 +4315,19 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".polyval."0.6.2" = overridableMkRustCrate (profileName: rec {
|
||||
name = "polyval";
|
||||
version = "0.6.2";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"; };
|
||||
dependencies = {
|
||||
cfg_if = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cfg-if."1.0.0" { inherit profileName; }).out;
|
||||
${ if hostPlatform.parsed.cpu.name == "aarch64" || hostPlatform.parsed.cpu.name == "x86_64" || hostPlatform.parsed.cpu.name == "i686" then "cpufeatures" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cpufeatures."0.2.12" { inherit profileName; }).out;
|
||||
opaque_debug = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" { inherit profileName; }).out;
|
||||
universal_hash = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".universal-hash."0.5.1" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".powerfmt."0.2.0" = overridableMkRustCrate (profileName: rec {
|
||||
name = "powerfmt";
|
||||
version = "0.2.0";
|
||||
|
@ -5381,27 +5465,6 @@ in
|
|||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".sled."0.34.7" = overridableMkRustCrate (profileName: rec {
|
||||
name = "sled";
|
||||
version = "0.34.7";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935"; };
|
||||
features = builtins.concatLists [
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default")
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "no_metrics")
|
||||
];
|
||||
dependencies = {
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crc32fast" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_epoch" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-epoch."0.9.18" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_utils" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-utils."0.8.19" { inherit profileName; }).out;
|
||||
${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && (hostPlatform.parsed.kernel.name == "linux" || hostPlatform.parsed.kernel.name == "darwin" || hostPlatform.parsed.kernel.name == "windows") then "fs2" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".fs2."0.4.3" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "fxhash" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".fxhash."0.2.1" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "libc" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.153" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "log" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.20" { inherit profileName; }).out;
|
||||
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "parking_lot" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parking_lot."0.11.2" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".smallvec."1.13.1" = overridableMkRustCrate (profileName: rec {
|
||||
name = "smallvec";
|
||||
version = "1.13.1";
|
||||
|
@ -6369,6 +6432,17 @@ in
|
|||
];
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".universal-hash."0.5.1" = overridableMkRustCrate (profileName: rec {
|
||||
name = "universal-hash";
|
||||
version = "0.5.1";
|
||||
registry = "registry+https://github.com/rust-lang/crates.io-index";
|
||||
src = fetchCratesIo { inherit name version; sha256 = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"; };
|
||||
dependencies = {
|
||||
crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out;
|
||||
subtle = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".subtle."2.5.0" { inherit profileName; }).out;
|
||||
};
|
||||
});
|
||||
|
||||
"registry+https://github.com/rust-lang/crates.io-index".unsafe-libyaml."0.2.10" = overridableMkRustCrate (profileName: rec {
|
||||
name = "unsafe-libyaml";
|
||||
version = "0.2.10";
|
||||
|
@ -6649,7 +6723,6 @@ in
|
|||
[ "minwindef" ]
|
||||
[ "ntstatus" ]
|
||||
[ "processenv" ]
|
||||
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "processthreadsapi")
|
||||
[ "std" ]
|
||||
[ "synchapi" ]
|
||||
[ "sysinfoapi" ]
|
||||
|
|
23
Cargo.toml
23
Cargo.toml
|
@ -21,15 +21,15 @@ default-members = ["src/garage"]
|
|||
|
||||
# Internal Garage crates
|
||||
format_table = { version = "0.1.1", path = "src/format-table" }
|
||||
garage_api = { version = "0.9.4", path = "src/api" }
|
||||
garage_block = { version = "0.9.4", path = "src/block" }
|
||||
garage_db = { version = "0.9.4", path = "src/db", default-features = false }
|
||||
garage_model = { version = "0.9.4", path = "src/model", default-features = false }
|
||||
garage_net = { version = "0.9.4", path = "src/net" }
|
||||
garage_rpc = { version = "0.9.4", path = "src/rpc" }
|
||||
garage_table = { version = "0.9.4", path = "src/table" }
|
||||
garage_util = { version = "0.9.4", path = "src/util" }
|
||||
garage_web = { version = "0.9.4", path = "src/web" }
|
||||
garage_api = { version = "1.0.0", path = "src/api" }
|
||||
garage_block = { version = "1.0.0", path = "src/block" }
|
||||
garage_db = { version = "1.0.0", path = "src/db", default-features = false }
|
||||
garage_model = { version = "1.0.0", path = "src/model", default-features = false }
|
||||
garage_net = { version = "1.0.0", path = "src/net" }
|
||||
garage_rpc = { version = "1.0.0", path = "src/rpc" }
|
||||
garage_table = { version = "1.0.0", path = "src/table" }
|
||||
garage_util = { version = "1.0.0", path = "src/util" }
|
||||
garage_web = { version = "1.0.0", path = "src/web" }
|
||||
k2v-client = { version = "0.0.4", path = "src/k2v-client" }
|
||||
|
||||
# External crates from crates.io
|
||||
|
@ -43,6 +43,8 @@ bytes = "1.0"
|
|||
bytesize = "1.1"
|
||||
cfg-if = "1.0"
|
||||
chrono = "0.4"
|
||||
crc32fast = "1.4"
|
||||
crc32c = "0.6"
|
||||
crypto-common = "0.1"
|
||||
digest = "0.10"
|
||||
err-derive = "0.3"
|
||||
|
@ -62,10 +64,12 @@ parse_duration = "2.1"
|
|||
pin-project = "1.0.12"
|
||||
pnet_datalink = "0.34"
|
||||
rand = "0.8"
|
||||
sha1 = "0.10"
|
||||
sha2 = "0.10"
|
||||
timeago = { version = "0.4", default-features = false }
|
||||
xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] }
|
||||
|
||||
aes-gcm = { version = "0.10", features = ["aes", "stream"] }
|
||||
sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" }
|
||||
kuska-handshake = { version = "0.2.0", features = ["default", "async_std"] }
|
||||
|
||||
|
@ -80,7 +84,6 @@ heed = { version = "0.11", default-features = false, features = ["lmdb"] }
|
|||
rusqlite = "0.31.0"
|
||||
r2d2 = "0.8"
|
||||
r2d2_sqlite = "0.24"
|
||||
sled = "0.34"
|
||||
|
||||
async-compression = { version = "0.4", features = ["tokio", "zstd"] }
|
||||
zstd = { version = "0.13", default-features = false }
|
||||
|
|
|
@ -40,7 +40,6 @@ in {
|
|||
features = [
|
||||
"garage/bundled-libs"
|
||||
"garage/k2v"
|
||||
"garage/sled"
|
||||
"garage/lmdb"
|
||||
"garage/sqlite"
|
||||
];
|
||||
|
|
|
@ -98,7 +98,6 @@ paths:
|
|||
type: string
|
||||
example:
|
||||
- "k2v"
|
||||
- "sled"
|
||||
- "lmdb"
|
||||
- "sqlite"
|
||||
- "consul-discovery"
|
||||
|
|
|
@ -80,6 +80,53 @@ To test your new configuration, just reload your Nextcloud webpage and start sen
|
|||
|
||||
*External link:* [Nextcloud Documentation > Primary Storage](https://docs.nextcloud.com/server/latest/admin_manual/configuration_files/primary_storage.html)
|
||||
|
||||
#### SSE-C encryption (since Garage v1.0)
|
||||
|
||||
Since version 1.0, Garage supports server-side encryption with customer keys
|
||||
(SSE-C). In this mode, Garage is responsible for encrypting and decrypting
|
||||
objects, but it does not store the encryption key itself. The encryption key
|
||||
should be provided by Nextcloud upon each request. This mode of operation is
|
||||
supported by Nextcloud and it has successfully been tested together with
|
||||
Garage.
|
||||
|
||||
To enable SSE-C encryption:
|
||||
|
||||
1. Make sure your Garage server is accessible via SSL through a reverse proxy
|
||||
such as Nginx, and that it is using a valid public certificate (Nextcloud
|
||||
might be able to connect to an S3 server that is using a self-signed
|
||||
certificate, but you will lose many hours while trying, so don't).
|
||||
Configure values for `use_ssl` and `port` accordingly in your `config.php`
|
||||
file.
|
||||
|
||||
2. Generate an encryption key using the following command:
|
||||
|
||||
```
|
||||
openssl rand -base64 32
|
||||
```
|
||||
|
||||
Make sure to keep this key **secret**!
|
||||
|
||||
3. Add the encryption key in your `config.php` file as follows:
|
||||
|
||||
|
||||
```php
|
||||
<?php
|
||||
$CONFIG = array(
|
||||
'objectstore' => [
|
||||
'class' => '\\OC\\Files\\ObjectStore\\S3',
|
||||
'arguments' => [
|
||||
...
|
||||
'sse_c_key' => 'exampleencryptionkeyLbU+5fKYQcVoqnn+RaIOXgo=',
|
||||
...
|
||||
],
|
||||
],
|
||||
```
|
||||
|
||||
Nextcloud will now make Garage encrypt files at rest in the storage bucket.
|
||||
These files will not be readable by an S3 client that has credentials to the
|
||||
bucket but doesn't also know the secret encryption key.
|
||||
|
||||
|
||||
### External Storage
|
||||
|
||||
**From the GUI.** Activate the "External storage support" app from the "Applications" page (click on your account icon on the top right corner of your screen to display the menu). Go to your parameters page (also located below your account icon). Click on external storage (or the corresponding translation in your language).
|
||||
|
@ -245,7 +292,7 @@ with average object size ranging from 50 KB to 150 KB.
|
|||
As such, your Garage cluster should be configured appropriately for good performance:
|
||||
|
||||
- use Garage v0.8.0 or higher with the [LMDB database engine](@documentation/reference-manual/configuration.md#db-engine-since-v0-8-0).
|
||||
With the default Sled database engine, your database could quickly end up taking tens of GB of disk space.
|
||||
Older versions of Garage used the Sled database engine which had issues, such as databases quickly ending up taking tens of GB of disk space.
|
||||
- the Garage database should be stored on a SSD
|
||||
|
||||
### Creating your bucket
|
||||
|
|
|
@ -53,20 +53,43 @@ and that's also why your nodes have super long identifiers.
|
|||
|
||||
Adding TLS support built into Garage is not currently planned.
|
||||
|
||||
## Garage stores data in plain text on the filesystem
|
||||
## Garage stores data in plain text on the filesystem or encrypted using customer keys (SSE-C)
|
||||
|
||||
Garage does not handle data encryption at rest by itself, and instead delegates
|
||||
to the user to add encryption, either at the storage layer (LUKS, etc) or on
|
||||
the client side (or both). There are no current plans to add data encryption
|
||||
directly in Garage.
|
||||
For standard S3 API requests, Garage does not encrypt data at rest by itself.
|
||||
For the most generic at rest encryption of data, we recommend setting up your
|
||||
storage partitions on encrypted LUKS devices.
|
||||
|
||||
Implementing data encryption directly in Garage might make things simpler for
|
||||
end users, but also raises many more questions, especially around key
|
||||
management: for encryption of data, where could Garage get the encryption keys
|
||||
from ? If we encrypt data but keep the keys in a plaintext file next to them,
|
||||
it's useless. We probably don't want to have to manage secrets in garage as it
|
||||
would be very hard to do in a secure way. Maybe integrate with an external
|
||||
system such as Hashicorp Vault?
|
||||
If you are developing your own client software that makes use of S3 storage,
|
||||
we recommend implementing data encryption directly on the client side and never
|
||||
transmitting plaintext data to Garage. This makes it easy to use an external
|
||||
untrusted storage provider if necessary.
|
||||
|
||||
Garage does support [SSE-C
|
||||
encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html),
|
||||
an encryption mode of Amazon S3 where data is encrypted at rest using
|
||||
encryption keys given by the client. The encryption keys are passed to the
|
||||
server in a header in each request, to encrypt or decrypt data at the moment of
|
||||
reading or writing. The server discards the key as soon as it has finished
|
||||
using it for the request. This mode allows the data to be encrypted at rest by
|
||||
Garage itself, but it requires support in the client software. It is also not
|
||||
adapted to a model where the server is not trusted or assumed to be
|
||||
compromised, as the server can easily know the encryption keys. Note however
|
||||
that when using SSE-C encryption, the only Garage node that knows the
|
||||
encryption key passed in a given request is the node to which the request is
|
||||
directed (which can be a gateway node), so it is easy to have untrusted nodes
|
||||
in the cluster as long as S3 API requests containing SSE-C encryption keys are
|
||||
not directed to them.
|
||||
|
||||
Implementing automatic data encryption directly in Garage without client-side
|
||||
management of keys (something like
|
||||
[SSE-S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingServerSideEncryption.html))
|
||||
could make things simpler for end users that don't want to set up LUKS, but also
|
||||
raises many more questions, especially around key management: for encryption of
|
||||
data, where could Garage get the encryption keys from? If we encrypt data but
|
||||
keep the keys in a plaintext file next to them, it's useless. We probably don't
|
||||
want to have to manage secrets in Garage as it would be very hard to do in a
|
||||
secure way. At the time of writing, there are no plans to implement this in
|
||||
Garage.
|
||||
|
||||
|
||||
# Adding data encryption using external tools
|
||||
|
|
|
@ -91,6 +91,5 @@ The following feature flags are available in v0.8.0:
|
|||
| `metrics` | *by default* | Enable collection of metrics in Prometheus format on the admin API |
|
||||
| `telemetry-otlp` | optional | Enable collection of execution traces using OpenTelemetry |
|
||||
| `syslog` | optional | Enable logging to Syslog |
|
||||
| `sled` | *by default* | Enable using Sled to store Garage's metadata |
|
||||
| `lmdb` | *by default* | Enable using LMDB to store Garage's metadata |
|
||||
| `sqlite` | *by default* | Enable using Sqlite3 to store Garage's metadata |
|
||||
|
|
|
@ -90,19 +90,20 @@ to store 2 TB of data in total.
|
|||
- If you only have an HDD and no SSD, it's fine to put your metadata alongside
|
||||
the data on the same drive, but then consider your filesystem choice wisely
|
||||
(see above). Having lots of RAM for your kernel to cache the metadata will
|
||||
help a lot with performance.
|
||||
help a lot with performance. The default LMDB database engine is the most
|
||||
tested and has good performance.
|
||||
|
||||
## Get a Docker image
|
||||
|
||||
Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated).
|
||||
We encourage you to use a fixed tag (eg. `v0.9.4`) and not the `latest` tag.
|
||||
For this example, we will use the latest published version at the time of the writing which is `v0.9.4` but it's up to you
|
||||
We encourage you to use a fixed tag (eg. `v1.0.0`) and not the `latest` tag.
|
||||
For this example, we will use the latest published version at the time of writing, which is `v1.0.0`, but it's up to you
|
||||
to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated).
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
sudo docker pull dxflrs/garage:v0.9.4
|
||||
sudo docker pull dxflrs/garage:v1.0.0
|
||||
```
|
||||
|
||||
## Deploying and configuring Garage
|
||||
|
@ -127,7 +128,7 @@ data_dir = "/var/lib/garage/data"
|
|||
db_engine = "lmdb"
|
||||
metadata_auto_snapshot_interval = "6h"
|
||||
|
||||
replication_mode = "3"
|
||||
replication_factor = 3
|
||||
|
||||
compression_level = 2
|
||||
|
||||
|
@ -168,7 +169,7 @@ docker run \
|
|||
-v /etc/garage.toml:/etc/garage.toml \
|
||||
-v /var/lib/garage/meta:/var/lib/garage/meta \
|
||||
-v /var/lib/garage/data:/var/lib/garage/data \
|
||||
dxflrs/garage:v0.9.4
|
||||
dxflrs/garage:v1.0.0
|
||||
```
|
||||
|
||||
With this command line, Garage should be started automatically at each boot.
|
||||
|
@ -182,7 +183,7 @@ If you want to use `docker-compose`, you may use the following `docker-compose.y
|
|||
version: "3"
|
||||
services:
|
||||
garage:
|
||||
image: dxflrs/garage:v0.9.4
|
||||
image: dxflrs/garage:v1.0.0
|
||||
network_mode: "host"
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
|
|
|
@ -97,7 +97,7 @@ delete a tombstone, the following condition has to be met:
|
|||
superseded by the tombstone. This ensures that deleting the tombstone is
|
||||
safe and that no deleted value will come back in the system.
|
||||
|
||||
Garage makes use of Sled's atomic operations (such as compare-and-swap and
|
||||
Garage uses atomic database operations (such as compare-and-swap and
|
||||
transactions) to ensure that only tombstones that have been correctly
|
||||
propagated to other nodes are ever deleted from the local entry tree.
|
||||
|
||||
|
|
|
@ -141,4 +141,7 @@ blocks may still be held by Garage. If you suspect that such corruption has occu
|
|||
in your cluster, you can run one of the following repair procedures:
|
||||
|
||||
- `garage repair versions`: checks that all versions belong to a non-deleted object, and purges any orphan version
|
||||
- `garage repair block_refs`: checks that all block references belong to a non-deleted object version, and purges any orphan block reference (this will then allow the blocks to be garbage-collected)
|
||||
|
||||
- `garage repair block-refs`: checks that all block references belong to a non-deleted object version, and purges any orphan block reference (this will then allow the blocks to be garbage-collected)
|
||||
|
||||
- `garage repair block-rc`: checks that the reference counters for blocks are in sync with the actual number of non-deleted entries in the block reference table
|
||||
|
|
|
@ -12,7 +12,7 @@ An introduction to building cluster layouts can be found in the [production depl
|
|||
In Garage, all of the data that can be stored in a given cluster is divided
|
||||
into slices which we call *partitions*. Each partition is stored by
|
||||
one or several nodes in the cluster
|
||||
(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication_mode)).
|
||||
(see [`replication_factor`](@/documentation/reference-manual/configuration.md#replication_factor)).
|
||||
The layout determines the correspondence between these partitions,
|
||||
which exist on a logical level, and actual storage nodes.
|
||||
|
||||
|
|
|
@ -59,7 +59,7 @@ metadata_dir = "/tmp/meta"
|
|||
data_dir = "/tmp/data"
|
||||
db_engine = "sqlite"
|
||||
|
||||
replication_mode = "none"
|
||||
replication_factor = 1
|
||||
|
||||
rpc_bind_addr = "[::]:3901"
|
||||
rpc_public_addr = "127.0.0.1:3901"
|
||||
|
|
|
@ -8,7 +8,8 @@ weight = 20
|
|||
Here is an example `garage.toml` configuration file that illustrates all of the possible options:
|
||||
|
||||
```toml
|
||||
replication_mode = "3"
|
||||
replication_factor = 3
|
||||
consistency_mode = "consistent"
|
||||
|
||||
metadata_dir = "/var/lib/garage/meta"
|
||||
data_dir = "/var/lib/garage/data"
|
||||
|
@ -22,8 +23,6 @@ db_engine = "lmdb"
|
|||
block_size = "1M"
|
||||
block_ram_buffer_max = "256MiB"
|
||||
|
||||
sled_cache_capacity = "128MiB"
|
||||
sled_flush_every_ms = 2000
|
||||
lmdb_map_size = "1T"
|
||||
|
||||
compression_level = 1
|
||||
|
@ -101,13 +100,12 @@ Top-level configuration options:
|
|||
[`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval),
|
||||
[`metadata_dir`](#metadata_dir),
|
||||
[`metadata_fsync`](#metadata_fsync),
|
||||
[`replication_mode`](#replication_mode),
|
||||
[`replication_factor`](#replication_factor),
|
||||
[`consistency_mode`](#consistency_mode),
|
||||
[`rpc_bind_addr`](#rpc_bind_addr),
|
||||
[`rpc_bind_outgoing`](#rpc_bind_outgoing),
|
||||
[`rpc_public_addr`](#rpc_public_addr),
|
||||
[`rpc_secret`/`rpc_secret_file`](#rpc_secret),
|
||||
[`sled_cache_capacity`](#sled_cache_capacity),
|
||||
[`sled_flush_every_ms`](#sled_flush_every_ms).
|
||||
[`rpc_secret`/`rpc_secret_file`](#rpc_secret).
|
||||
|
||||
The `[consul_discovery]` section:
|
||||
[`api`](#consul_api),
|
||||
|
@ -161,11 +159,12 @@ values in the configuration file:
|
|||
|
||||
### Top-level configuration options
|
||||
|
||||
#### `replication_mode` {#replication_mode}
|
||||
#### `replication_factor` {#replication_factor}
|
||||
|
||||
Garage supports the following replication modes:
|
||||
The replication factor can be any positive integer smaller than or equal to the node count in your cluster.
|
||||
The chosen replication factor has a big impact on the cluster's failure tolerance and performance characteristics.
|
||||
|
||||
- `none` or `1`: data stored on Garage is stored on a single node. There is no
|
||||
- `1`: data stored on Garage is stored on a single node. There is no
|
||||
redundancy, and data will be unavailable as soon as one node fails or its
|
||||
network is disconnected. Do not use this for anything else than test
|
||||
deployments.
|
||||
|
@ -176,17 +175,6 @@ Garage supports the following replication modes:
|
|||
before losing data. Data remains available in read-only mode when one node is
|
||||
down, but write operations will fail.
|
||||
|
||||
- `2-dangerous`: a variant of mode `2`, where written objects are written to
|
||||
the second replica asynchronously. This means that Garage will return `200
|
||||
OK` to a PutObject request before the second copy is fully written (or even
|
||||
before it even starts being written). This means that data can more easily
|
||||
be lost if the node crashes before a second copy can be completed. This
|
||||
also means that written objects might not be visible immediately in read
|
||||
operations. In other words, this mode severely breaks the consistency and
|
||||
durability guarantees of standard Garage cluster operation. Benefits of
|
||||
this mode: you can still write to your cluster when one node is
|
||||
unavailable.
|
||||
|
||||
- `3`: data stored on Garage will be stored on three different nodes, if
|
||||
possible each in a different zone. Garage tolerates two node failures, or
|
||||
several node failures but in no more than two zones (in a deployment with at
|
||||
|
@ -194,55 +182,84 @@ Garage supports the following replication modes:
|
|||
or node failures are only in a single zone, reading and writing data to
|
||||
Garage can continue normally.
|
||||
|
||||
- `3-degraded`: a variant of replication mode `3`, that lowers the read
|
||||
quorum to `1`, to allow you to read data from your cluster when several
|
||||
nodes (or nodes in several zones) are unavailable. In this mode, Garage
|
||||
does not provide read-after-write consistency anymore. The write quorum is
|
||||
still 2, ensuring that data successfully written to Garage is stored on at
|
||||
least two nodes.
|
||||
|
||||
- `3-dangerous`: a variant of replication mode `3` that lowers both the read
|
||||
and write quorums to `1`, to allow you to both read and write to your
|
||||
cluster when several nodes (or nodes in several zones) are unavailable. It
|
||||
is the least consistent mode of operation proposed by Garage, and also one
|
||||
that should probably never be used.
|
||||
- `5`, `7`, ...: When setting the replication factor above 3, it is most useful to
|
||||
choose an odd value, since for every two copies added, one more node can fail
|
||||
before losing the ability to write and read to the cluster.
|
||||
|
||||
Note that in modes `2` and `3`,
|
||||
if at least the same number of zones are available, an arbitrary number of failures in
|
||||
any given zone is tolerated as copies of data will be spread over several zones.
|
||||
|
||||
**Make sure `replication_mode` is the same in the configuration files of all nodes.
|
||||
**Make sure `replication_factor` is the same in the configuration files of all nodes.
|
||||
Never run a Garage cluster where that is not the case.**
|
||||
|
||||
It is technically possible to change the replication factor although it's a
|
||||
dangerous operation that is not officially supported. This requires you to
|
||||
delete the existing cluster layout and create a new layout from scratch,
|
||||
meaning that a full rebalancing of your cluster's data will be needed. To do
|
||||
it, shut down your cluster entirely, delete the `cluster_layout` files in the
|
||||
meta directories of all your nodes, update all your configuration files with
|
||||
the new `replication_factor` parameter, restart your cluster, and then create a
|
||||
new layout with all the nodes you want to keep. Rebalancing data will take
|
||||
some time, and data might temporarily appear unavailable to your users.
|
||||
It is recommended to shut down public access to the cluster while rebalancing
|
||||
is in progress. In theory, no data should be lost as rebalancing is a
|
||||
routine operation for Garage, although we cannot guarantee you that everything
|
||||
will go right in such an extreme scenario.
|
||||
|
||||
#### `consistency_mode` {#consistency_mode}
|
||||
|
||||
The consistency mode setting determines the read and write behaviour of your cluster.
|
||||
|
||||
- `consistent`: The default setting. This is what the paragraph above describes.
|
||||
The read and write quorum will be determined so that read-after-write consistency
|
||||
is guaranteed.
|
||||
- `degraded`: Lowers the read
|
||||
quorum to `1`, to allow you to read data from your cluster when several
|
||||
nodes (or nodes in several zones) are unavailable. In this mode, Garage
|
||||
does not provide read-after-write consistency anymore.
|
||||
The write quorum stays the same as in the `consistent` mode, ensuring that
|
||||
data successfully written to Garage is stored on multiple nodes (depending
|
||||
on the replication factor).
|
||||
- `dangerous`: This mode lowers both the read
|
||||
and write quorums to `1`, to allow you to both read and write to your
|
||||
cluster when several nodes (or nodes in several zones) are unavailable. It
|
||||
is the least consistent mode of operation proposed by Garage, and also one
|
||||
that should probably never be used.
|
||||
|
||||
Changing the `consistency_mode` between modes while leaving the `replication_factor` untouched
|
||||
(e.g. setting your node's `consistency_mode` to `degraded` when it was previously unset, or from
|
||||
`dangerous` to `consistent`), can be done easily by just changing the `consistency_mode`
|
||||
parameter in your config files and restarting all your Garage nodes.
|
||||
|
||||
The consistency mode can be used together with various replication factors, to achieve
|
||||
a wide range of read and write characteristics. Some examples:
|
||||
|
||||
- Replication factor `2`, consistency mode `degraded`: While this mode
|
||||
technically exists, its properties are the same as with consistency mode `consistent`,
|
||||
since the read quorum with replication factor `2`, consistency mode `consistent` is already 1.
|
||||
|
||||
- Replication factor `2`, consistency mode `dangerous`: written objects are written to
|
||||
the second replica asynchronously. This means that Garage will return `200
|
||||
OK` to a PutObject request before the second copy is fully written (or even
|
||||
before it even starts being written). This means that data can more easily
|
||||
be lost if the node crashes before a second copy can be completed. This
|
||||
also means that written objects might not be visible immediately in read
|
||||
operations. In other words, this configuration severely breaks the consistency and
|
||||
durability guarantees of standard Garage cluster operation. Benefits of
|
||||
this configuration: you can still write to your cluster when one node is
|
||||
unavailable.
|
||||
|
||||
The quorums associated with each replication mode are described below:
|
||||
|
||||
| `replication_mode` | Number of replicas | Write quorum | Read quorum | Read-after-write consistency? |
|
||||
| ------------------ | ------------------ | ------------ | ----------- | ----------------------------- |
|
||||
| `none` or `1` | 1 | 1 | 1 | yes |
|
||||
| `2` | 2 | 2 | 1 | yes |
|
||||
| `2-dangerous` | 2 | 1 | 1 | NO |
|
||||
| `3` | 3 | 2 | 2 | yes |
|
||||
| `3-degraded` | 3 | 2 | 1 | NO |
|
||||
| `3-dangerous` | 3 | 1 | 1 | NO |
|
||||
|
||||
Changing the `replication_mode` between modes with the same number of replicas
|
||||
(e.g. from `3` to `3-degraded`, or from `2-dangerous` to `2`), can be done easily by
|
||||
just changing the `replication_mode` parameter in your config files and restarting all your
|
||||
Garage nodes.
|
||||
|
||||
It is also technically possible to change the replication mode to a mode with a
|
||||
different numbers of replicas, although it's a dangerous operation that is not
|
||||
officially supported. This requires you to delete the existing cluster layout
|
||||
and create a new layout from scratch, meaning that a full rebalancing of your
|
||||
cluster's data will be needed. To do it, shut down your cluster entirely,
|
||||
delete the `custer_layout` files in the meta directories of all your nodes,
|
||||
update all your configuration files with the new `replication_mode` parameter,
|
||||
restart your cluster, and then create a new layout with all the nodes you want
|
||||
to keep. Rebalancing data will take some time, and data might temporarily
|
||||
appear unavailable to your users. It is recommended to shut down public access
|
||||
to the cluster while rebalancing is in progress. In theory, no data should be
|
||||
lost as rebalancing is a routine operation for Garage, although we cannot
|
||||
guarantee you that everything will go right in such an extreme scenario.
|
||||
| `consistency_mode` | `replication_factor` | Write quorum | Read quorum | Read-after-write consistency? |
|
||||
| ------------------ | -------------------- | ------------ | ----------- | ----------------------------- |
|
||||
| `consistent` | 1 | 1 | 1 | yes |
|
||||
| `consistent` | 2 | 2 | 1 | yes |
|
||||
| `dangerous` | 2 | 1 | 1 | NO |
|
||||
| `consistent` | 3 | 2 | 2 | yes |
|
||||
| `degraded` | 3 | 2 | 1 | NO |
|
||||
| `dangerous` | 3 | 1 | 1 | NO |
|
||||
|
||||
#### `metadata_dir` {#metadata_dir}
|
||||
|
||||
|
@ -278,23 +295,18 @@ Since `v0.8.0`, Garage can use alternative storage backends as follows:
|
|||
|
||||
| DB engine | `db_engine` value | Database path |
|
||||
| --------- | ----------------- | ------------- |
|
||||
| [LMDB](https://www.lmdb.tech) (default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` |
|
||||
| [Sled](https://sled.rs) (default up to `v0.8.0`) | `"sled"` | `<metadata_dir>/db/` |
|
||||
| [Sqlite](https://sqlite.org) | `"sqlite"` | `<metadata_dir>/db.sqlite` |
|
||||
| [LMDB](https://www.lmdb.tech) (since `v0.8.0`, default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` |
|
||||
| [Sqlite](https://sqlite.org) (since `v0.8.0`) | `"sqlite"` | `<metadata_dir>/db.sqlite` |
|
||||
| [Sled](https://sled.rs) (old default, removed since `v1.0`) | `"sled"` | `<metadata_dir>/db/` |
|
||||
|
||||
Sled was the only database engine up to Garage v0.7.0. Performance issues and
|
||||
API limitations of Sled prompted the addition of alternative engines in v0.8.0.
|
||||
Since v0.9.0, LMDB is the default engine instead of Sled, and Sled is
|
||||
deprecated. We plan to remove Sled in Garage v1.0.
|
||||
Sled was supported until Garage v0.9.x, and was removed in Garage v1.0.
|
||||
You can still use an older binary of Garage (e.g. v0.9.4) to migrate
|
||||
old Sled metadata databases to another engine.
|
||||
|
||||
Performance characteristics of the different DB engines are as follows:
|
||||
|
||||
- Sled: tends to produce large data files and also has performance issues,
|
||||
especially when the metadata folder is on a traditional HDD and not on SSD.
|
||||
|
||||
- LMDB: the recommended database engine for high-performance distributed
|
||||
clusters, much more space-efficient and significantly faster. LMDB works very
|
||||
well, but is known to have the following limitations:
|
||||
- LMDB: the recommended database engine for high-performance distributed clusters.
|
||||
LMDB works very well, but is known to have the following limitations:
|
||||
|
||||
- The data format of LMDB is not portable between architectures, so for
|
||||
instance the Garage database of an x86-64 node cannot be moved to an ARM64
|
||||
|
@ -310,6 +322,9 @@ Performance characteristics of the different DB engines are as follows:
|
|||
other nodes), or if you have saved regular snapshots at the filesystem
|
||||
level.
|
||||
|
||||
- Keys in LMDB are limited to 511 bytes. This limit translates to limits on
|
||||
object keys in S3 and sort keys in K2V that are limited to 479 bytes.
|
||||
|
||||
- Sqlite: Garage supports Sqlite as an alternative storage backend for
|
||||
metadata, which does not have the issues listed above for LMDB.
|
||||
On versions 0.8.x and earlier, Sqlite should be avoided due to abysmal
|
||||
|
@ -353,7 +368,6 @@ Here is how this option impacts the different database engines:
|
|||
|
||||
| Database | `metadata_fsync = false` (default) | `metadata_fsync = true` |
|
||||
|----------|------------------------------------|-------------------------------|
|
||||
| Sled | default options | *unsupported* |
|
||||
| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` |
|
||||
| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` |
|
||||
|
||||
|
@ -455,21 +469,6 @@ node.
|
|||
|
||||
The default value is 256MiB.
|
||||
|
||||
#### `sled_cache_capacity` {#sled_cache_capacity}
|
||||
|
||||
This parameter can be used to tune the capacity of the cache used by
|
||||
[sled](https://sled.rs), the database Garage uses internally to store metadata.
|
||||
Tune this to fit the RAM you wish to make available to your Garage instance.
|
||||
This value has a conservative default (128MB) so that Garage doesn't use too much
|
||||
RAM by default, but feel free to increase this for higher performance.
|
||||
|
||||
#### `sled_flush_every_ms` {#sled_flush_every_ms}
|
||||
|
||||
This parameters can be used to tune the flushing interval of sled.
|
||||
Increase this if sled is thrashing your SSD, at the risk of losing more data in case
|
||||
of a power outage (though this should not matter much as data is replicated on other
|
||||
nodes). The default value, 2000ms, should be appropriate for most use cases.
|
||||
|
||||
#### `lmdb_map_size` {#lmdb_map_size}
|
||||
|
||||
This parameter can be used to set the map size used by LMDB,
|
||||
|
|
|
@ -39,10 +39,10 @@ Read about cluster layout management [here](@/documentation/operations/layout.md
|
|||
|
||||
### Several replication modes
|
||||
|
||||
Garage supports a variety of replication modes, with 1 copy, 2 copies or 3 copies of your data,
|
||||
Garage supports a variety of replication modes, with configurable replica count,
|
||||
and with various levels of consistency, in order to adapt to a variety of usage scenarios.
|
||||
Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_mode)
|
||||
to select the replication mode best suited to your use case (hint: in most cases, `replication_mode = "3"` is what you want).
|
||||
Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_factor)
|
||||
to select the replication mode best suited to your use case (hint: in most cases, `replication_factor = 3` is what you want).
|
||||
|
||||
### Compression and deduplication
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ Feel free to open a PR to suggest fixes this table. Minio is missing because the
|
|||
| [URL path-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#path-style-access) (eg. `host.tld/bucket/key`) | ✅ Implemented | ✅ | ✅ | ❓| ✅ |
|
||||
| [URL vhost-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access) URL (eg. `bucket.host.tld/key`) | ✅ Implemented | ❌| ✅| ✅ | ✅ |
|
||||
| [Presigned URLs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html) | ✅ Implemented | ❌| ✅ | ✅ | ✅(❓) |
|
||||
| [SSE-C encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html) | ✅ Implemented | ❓ | ✅ | ❌ | ✅ |
|
||||
|
||||
*Note:* OpenIO does not say whether it supports presigned URLs. Because it is part
|
||||
of signature v4 and they claim they support it without further details,
|
||||
|
|
77
doc/book/working-documents/migration-1.md
Normal file
77
doc/book/working-documents/migration-1.md
Normal file
|
@ -0,0 +1,77 @@
|
|||
+++
|
||||
title = "Migrating from 0.9 to 1.0"
|
||||
weight = 11
|
||||
+++
|
||||
|
||||
**This guide explains how to migrate to 1.0 if you have an existing 0.9 cluster.
|
||||
We don't recommend trying to migrate to 1.0 directly from 0.8 or older.**
|
||||
|
||||
This migration procedure has been tested on several clusters without issues.
|
||||
However, it is still a *critical procedure* that might cause issues.
|
||||
**Make sure to back up all your data before attempting it!**
|
||||
|
||||
You might also want to read our [general documentation on upgrading Garage](@/documentation/operations/upgrading.md).
|
||||
|
||||
## Changes introduced in v1.0
|
||||
|
||||
The following are **breaking changes** in Garage v1.0 that require your attention when migrating:
|
||||
|
||||
- The Sled metadata db engine has been **removed**. If your cluster was still
|
||||
using Sled, you will need to **use a Garage v0.9.x binary** to convert the
|
||||
database using the `garage convert-db` subcommand. See
|
||||
[here](@/documentation/reference-manual/configuration.md#db_engine) for the
|
||||
details of the procedure.
|
||||
|
||||
The following syntax changes have been made to the configuration file:
|
||||
|
||||
- The `replication_mode` parameter has been split into two parameters:
|
||||
[`replication_factor`](@/documentation/reference-manual/configuration.md#replication_factor)
|
||||
and
|
||||
[`consistency_mode`](@/documentation/reference-manual/configuration.md#consistency_mode).
|
||||
The old syntax using `replication_mode` is still supported for legacy
|
||||
reasons and can still be used.
|
||||
|
||||
- The parameters `sled_cache_capacity` and `sled_flush_every_ms` have been removed.
|
||||
|
||||
## Migration procedure
|
||||
|
||||
The migration to Garage v1.0 can be done with almost no downtime,
|
||||
by restarting all nodes at once in the new version.
|
||||
|
||||
The migration steps are as follows:
|
||||
|
||||
1. Do a `garage repair --all-nodes --yes tables`, check the logs and check that
|
||||
all data seems to be synced correctly between nodes. If you have time, do
|
||||
additional `garage repair` procedures (`blocks`, `versions`, `block_refs`,
|
||||
etc.)
|
||||
|
||||
2. Ensure you have a snapshot of your Garage installation that you can restore
|
||||
to in case the upgrade goes wrong:
|
||||
|
||||
- If you are running Garage v0.9.4 or later, use `garage meta snapshot
|
||||
--all` to make a backup snapshot of the metadata directories of your nodes
|
||||
for backup purposes, and save a copy of the following files in the
|
||||
metadata directories of your nodes: `cluster_layout`, `data_layout`,
|
||||
`node_key`, `node_key.pub`.
|
||||
|
||||
- If you are running a filesystem such as ZFS or BTRFS that supports
|
||||
snapshotting, you can create a filesystem-level snapshot to be used as a
|
||||
restoration point if needed.
|
||||
|
||||
- In other cases, make a backup using the old procedure: turn off each node
|
||||
individually; back up its metadata folder (for instance, use the following
|
||||
command if your metadata directory is `/var/lib/garage/meta`: `cd
|
||||
/var/lib/garage ; tar -acf meta-v0.9.tar.zst meta/`); turn it back on
|
||||
again. This will allow you to take a backup of all nodes without
|
||||
impacting global cluster availability. You can do all nodes of a single
|
||||
zone at once as this does not impact the availability of Garage.
|
||||
|
||||
3. Prepare your updated binaries and configuration files for Garage v1.0
|
||||
|
||||
4. Shut down all v0.9 nodes simultaneously, and restart them all simultaneously
|
||||
in v1.0. Use your favorite deployment tool (Ansible, Kubernetes, Nomad) to
|
||||
achieve this as fast as possible. Garage v1.0 should be in a working state
|
||||
as soon as enough nodes have started.
|
||||
|
||||
5. Monitor your cluster in the following hours to see if it works well under
|
||||
your production load.
|
|
@ -69,11 +69,10 @@ Example response body:
|
|||
|
||||
```json
|
||||
{
|
||||
"node": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
|
||||
"garageVersion": "git:v0.9.0-dev",
|
||||
"node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
|
||||
"garageVersion": "v1.0.0",
|
||||
"garageFeatures": [
|
||||
"k2v",
|
||||
"sled",
|
||||
"lmdb",
|
||||
"sqlite",
|
||||
"metrics",
|
||||
|
@ -81,84 +80,93 @@ Example response body:
|
|||
],
|
||||
"rustVersion": "1.68.0",
|
||||
"dbEngine": "LMDB (using Heed crate)",
|
||||
"knownNodes": [
|
||||
"layoutVersion": 5,
|
||||
"nodes": [
|
||||
{
|
||||
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
|
||||
"addr": "10.0.0.11:3901",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 9,
|
||||
"hostname": "node1"
|
||||
},
|
||||
{
|
||||
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
|
||||
"addr": "10.0.0.12:3901",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 1,
|
||||
"hostname": "node2"
|
||||
},
|
||||
{
|
||||
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
|
||||
"addr": "10.0.0.21:3901",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 7,
|
||||
"hostname": "node3"
|
||||
},
|
||||
{
|
||||
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
|
||||
"addr": "10.0.0.22:3901",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 1,
|
||||
"hostname": "node4"
|
||||
}
|
||||
],
|
||||
"layout": {
|
||||
"version": 12,
|
||||
"roles": [
|
||||
{
|
||||
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
|
||||
"id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
|
||||
"role": {
|
||||
"id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
|
||||
"zone": "dc1",
|
||||
"capacity": 10737418240,
|
||||
"tags": [
|
||||
"node1"
|
||||
]
|
||||
"capacity": 100000000000,
|
||||
"tags": []
|
||||
},
|
||||
"addr": "10.0.0.3:3901",
|
||||
"hostname": "node3",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 12,
|
||||
"draining": false,
|
||||
"dataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
},
|
||||
"metadataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
|
||||
"id": "a11c7cf18af297379eff8688360155fe68d9061654449ba0ce239252f5a7487f",
|
||||
"role": null,
|
||||
"addr": "10.0.0.2:3901",
|
||||
"hostname": "node2",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 11,
|
||||
"draining": true,
|
||||
"dataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
},
|
||||
"metadataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
|
||||
"role": {
|
||||
"id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
|
||||
"zone": "dc1",
|
||||
"capacity": 10737418240,
|
||||
"tags": [
|
||||
"node2"
|
||||
]
|
||||
"capacity": 100000000000,
|
||||
"tags": []
|
||||
},
|
||||
"addr": "127.0.0.1:3904",
|
||||
"hostname": "lindy",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 2,
|
||||
"draining": false,
|
||||
"dataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
},
|
||||
"metadataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
|
||||
"zone": "dc2",
|
||||
"capacity": 10737418240,
|
||||
"tags": [
|
||||
"node3"
|
||||
]
|
||||
"id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
|
||||
"role": {
|
||||
"id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
|
||||
"zone": "dc1",
|
||||
"capacity": 100000000000,
|
||||
"tags": []
|
||||
},
|
||||
"addr": "10.0.0.1:3901",
|
||||
"hostname": "node1",
|
||||
"isUp": true,
|
||||
"lastSeenSecsAgo": 3,
|
||||
"draining": false,
|
||||
"dataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
},
|
||||
"metadataPartition": {
|
||||
"available": 660270088192,
|
||||
"total": 873862266880
|
||||
}
|
||||
],
|
||||
"stagedRoleChanges": [
|
||||
{
|
||||
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
|
||||
"remove": false,
|
||||
"zone": "dc2",
|
||||
"capacity": 10737418240,
|
||||
"tags": [
|
||||
"node4"
|
||||
]
|
||||
}
|
||||
{
|
||||
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
|
||||
"remove": true,
|
||||
"zone": null,
|
||||
"capacity": null,
|
||||
"tags": null,
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### GetClusterHealth `GET /v1/health`
|
||||
|
|
|
@ -146,7 +146,7 @@ in a bucket, as the partition key becomes the sort key in the index.
|
|||
How indexing works:
|
||||
|
||||
- Each node keeps a local count of how many items it stores for each partition,
|
||||
in a local Sled tree that is updated atomically when an item is modified.
|
||||
in a local database tree that is updated atomically when an item is modified.
|
||||
- These local counters are asynchronously stored in the index table which is
|
||||
a regular Garage table spread in the network. Counters are stored as LWW values,
|
||||
so basically the final table will have the following structure:
|
||||
|
|
|
@ -168,7 +168,7 @@ let
|
|||
rootFeatures = if features != null then
|
||||
features
|
||||
else
|
||||
([ "garage/bundled-libs" "garage/sled" "garage/lmdb" "garage/sqlite" "garage/k2v" ] ++ (if release then [
|
||||
([ "garage/bundled-libs" "garage/lmdb" "garage/sqlite" "garage/k2v" ] ++ (if release then [
|
||||
"garage/consul-discovery"
|
||||
"garage/kubernetes-discovery"
|
||||
"garage/metrics"
|
||||
|
|
|
@ -15,10 +15,10 @@ type: application
|
|||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.4.2
|
||||
version: 0.5.0
|
||||
|
||||
# This is the version number of the application being deployed. This version number should be
|
||||
# incremented each time you make changes to the application. Versions are not expected to
|
||||
# follow Semantic Versioning. They should reflect the version the application is using.
|
||||
# It is recommended to use it with quotes.
|
||||
appVersion: "v0.9.4"
|
||||
appVersion: "v1.0.0"
|
||||
|
|
|
@ -6,18 +6,13 @@
|
|||
garage:
|
||||
# Can be changed for better performance on certain systems
|
||||
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0
|
||||
dbEngine: "sled"
|
||||
dbEngine: "lmdb"
|
||||
|
||||
# Default is 1MB
|
||||
# An increase can result in better performance in certain scenarios
|
||||
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#block-size
|
||||
blockSize: "1048576"
|
||||
|
||||
# Tuning parameters for the sled DB engine
|
||||
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#sled-cache-capacity
|
||||
sledCacheCapacity: "134217728"
|
||||
sledFlushEveryMs: "2000"
|
||||
|
||||
# Default to 3 replicas, see the replication_mode section at
|
||||
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#replication-mode
|
||||
replicationMode: "3"
|
||||
|
@ -50,11 +45,6 @@ garage:
|
|||
|
||||
block_size = {{ .Values.garage.blockSize }}
|
||||
|
||||
{{- if eq .Values.garage.dbEngine "sled"}}
|
||||
sled_cache_capacity = {{ .Values.garage.sledCacheCapacity }}
|
||||
sled_flush_every_ms = {{ .Values.garage.sledFlushEveryMs }}
|
||||
{{- end }}
|
||||
|
||||
replication_mode = "{{ .Values.garage.replicationMode }}"
|
||||
|
||||
compression_level = {{ .Values.garage.compressionLevel }}
|
||||
|
|
|
@ -82,6 +82,19 @@ if [ -z "$SKIP_AWS" ]; then
|
|||
exit 1
|
||||
fi
|
||||
aws s3api delete-object --bucket eprouvette --key upload
|
||||
|
||||
echo "🛠️ Test SSE-C with awscli (aws s3)"
|
||||
SSEC_KEY="u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y="
|
||||
SSEC_KEY_MD5="jMGbs3GyZkYjJUP6q5jA7g=="
|
||||
echo "$SSEC_KEY" | base64 -d > /tmp/garage.ssec-key
|
||||
for idx in {1,2}.rnd; do
|
||||
aws s3 cp --sse-c AES256 --sse-c-key fileb:///tmp/garage.ssec-key \
|
||||
"/tmp/garage.$idx" "s3://eprouvette/garage.$idx.aws.sse-c"
|
||||
aws s3 cp --sse-c AES256 --sse-c-key fileb:///tmp/garage.ssec-key \
|
||||
"s3://eprouvette/garage.$idx.aws.sse-c" "/tmp/garage.$idx.dl.sse-c"
|
||||
diff "/tmp/garage.$idx" "/tmp/garage.$idx.dl.sse-c"
|
||||
aws s3api delete-object --bucket eprouvette --key "garage.$idx.aws.sse-c"
|
||||
done
|
||||
fi
|
||||
|
||||
# S3CMD
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "garage_api"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
authors = ["Alex Auvolat <alex@adnab.me>"]
|
||||
edition = "2018"
|
||||
license = "AGPL-3.0"
|
||||
|
@ -21,11 +21,15 @@ garage_net.workspace = true
|
|||
garage_util.workspace = true
|
||||
garage_rpc.workspace = true
|
||||
|
||||
aes-gcm.workspace = true
|
||||
argon2.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
base64.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono.workspace = true
|
||||
crc32fast.workspace = true
|
||||
crc32c.workspace = true
|
||||
crypto-common.workspace = true
|
||||
err-derive.workspace = true
|
||||
hex.workspace = true
|
||||
|
@ -35,12 +39,14 @@ tracing.workspace = true
|
|||
md-5.workspace = true
|
||||
nom.workspace = true
|
||||
pin-project.workspace = true
|
||||
sha1.workspace = true
|
||||
sha2.workspace = true
|
||||
|
||||
futures.workspace = true
|
||||
futures-util.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
form_urlencoded.workspace = true
|
||||
http.workspace = true
|
||||
|
|
|
@ -276,7 +276,7 @@ impl ApiHandler for AdminApiServer {
|
|||
Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await,
|
||||
Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await,
|
||||
Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await,
|
||||
Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await,
|
||||
Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage).await,
|
||||
// Keys
|
||||
Endpoint::ListKeys => handle_list_keys(&self.garage).await,
|
||||
Endpoint::GetKeyInfo {
|
||||
|
|
|
@ -123,7 +123,7 @@ async fn bucket_info_results(
|
|||
.table
|
||||
.get(&bucket_id, &EmptyKey)
|
||||
.await?
|
||||
.map(|x| x.filtered_values(&garage.system.ring.borrow()))
|
||||
.map(|x| x.filtered_values(&garage.system.cluster_layout()))
|
||||
.unwrap_or_default();
|
||||
|
||||
let mpu_counters = garage
|
||||
|
@ -131,7 +131,7 @@ async fn bucket_info_results(
|
|||
.table
|
||||
.get(&bucket_id, &EmptyKey)
|
||||
.await?
|
||||
.map(|x| x.filtered_values(&garage.system.ring.borrow()))
|
||||
.map(|x| x.filtered_values(&garage.system.cluster_layout()))
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut relevant_keys = HashMap::new();
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
use std::collections::HashMap;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
|
||||
|
@ -16,25 +17,99 @@ use crate::admin::error::*;
|
|||
use crate::helpers::{json_ok_response, parse_json_body};
|
||||
|
||||
pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> {
|
||||
let layout = garage.system.cluster_layout();
|
||||
let mut nodes = garage
|
||||
.system
|
||||
.get_known_nodes()
|
||||
.into_iter()
|
||||
.map(|i| {
|
||||
(
|
||||
i.id,
|
||||
NodeResp {
|
||||
id: hex::encode(i.id),
|
||||
addr: i.addr,
|
||||
hostname: i.status.hostname,
|
||||
is_up: i.is_up,
|
||||
last_seen_secs_ago: i.last_seen_secs_ago,
|
||||
data_partition: i
|
||||
.status
|
||||
.data_disk_avail
|
||||
.map(|(avail, total)| FreeSpaceResp {
|
||||
available: avail,
|
||||
total,
|
||||
}),
|
||||
metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| {
|
||||
FreeSpaceResp {
|
||||
available: avail,
|
||||
total,
|
||||
}
|
||||
}),
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
for (id, _, role) in layout.current().roles.items().iter() {
|
||||
if let layout::NodeRoleV(Some(r)) = role {
|
||||
let role = NodeRoleResp {
|
||||
id: hex::encode(id),
|
||||
zone: r.zone.to_string(),
|
||||
capacity: r.capacity,
|
||||
tags: r.tags.clone(),
|
||||
};
|
||||
match nodes.get_mut(id) {
|
||||
None => {
|
||||
nodes.insert(
|
||||
*id,
|
||||
NodeResp {
|
||||
id: hex::encode(id),
|
||||
role: Some(role),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
}
|
||||
Some(n) => {
|
||||
n.role = Some(role);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for ver in layout.versions().iter().rev().skip(1) {
|
||||
for (id, _, role) in ver.roles.items().iter() {
|
||||
if let layout::NodeRoleV(Some(r)) = role {
|
||||
if r.capacity.is_some() {
|
||||
if let Some(n) = nodes.get_mut(id) {
|
||||
if n.role.is_none() {
|
||||
n.draining = true;
|
||||
}
|
||||
} else {
|
||||
nodes.insert(
|
||||
*id,
|
||||
NodeResp {
|
||||
id: hex::encode(id),
|
||||
draining: true,
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut nodes = nodes.into_values().collect::<Vec<_>>();
|
||||
nodes.sort_by(|x, y| x.id.cmp(&y.id));
|
||||
|
||||
let res = GetClusterStatusResponse {
|
||||
node: hex::encode(garage.system.id),
|
||||
garage_version: garage_util::version::garage_version(),
|
||||
garage_features: garage_util::version::garage_features(),
|
||||
rust_version: garage_util::version::rust_version(),
|
||||
db_engine: garage.db.engine(),
|
||||
known_nodes: garage
|
||||
.system
|
||||
.get_known_nodes()
|
||||
.into_iter()
|
||||
.map(|i| KnownNodeResp {
|
||||
id: hex::encode(i.id),
|
||||
addr: i.addr,
|
||||
is_up: i.is_up,
|
||||
last_seen_secs_ago: i.last_seen_secs_ago,
|
||||
hostname: i.status.hostname,
|
||||
})
|
||||
.collect(),
|
||||
layout: format_cluster_layout(&garage.system.get_cluster_layout()),
|
||||
layout_version: layout.current().version,
|
||||
nodes,
|
||||
};
|
||||
|
||||
Ok(json_ok_response(&res)?)
|
||||
|
@ -85,13 +160,14 @@ pub async fn handle_connect_cluster_nodes(
|
|||
}
|
||||
|
||||
pub async fn handle_get_cluster_layout(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> {
|
||||
let res = format_cluster_layout(&garage.system.get_cluster_layout());
|
||||
let res = format_cluster_layout(garage.system.cluster_layout().inner());
|
||||
|
||||
Ok(json_ok_response(&res)?)
|
||||
}
|
||||
|
||||
fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResponse {
|
||||
fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResponse {
|
||||
let roles = layout
|
||||
.current()
|
||||
.roles
|
||||
.items()
|
||||
.iter()
|
||||
|
@ -105,10 +181,12 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp
|
|||
.collect::<Vec<_>>();
|
||||
|
||||
let staged_role_changes = layout
|
||||
.staging_roles
|
||||
.staging
|
||||
.get()
|
||||
.roles
|
||||
.items()
|
||||
.iter()
|
||||
.filter(|(k, _, v)| layout.roles.get(k) != Some(v))
|
||||
.filter(|(k, _, v)| layout.current().roles.get(k) != Some(v))
|
||||
.map(|(k, _, v)| match &v.0 {
|
||||
None => NodeRoleChange {
|
||||
id: hex::encode(k),
|
||||
|
@ -126,7 +204,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp
|
|||
.collect::<Vec<_>>();
|
||||
|
||||
GetClusterLayoutResponse {
|
||||
version: layout.version,
|
||||
version: layout.current().version,
|
||||
roles,
|
||||
staged_role_changes,
|
||||
}
|
||||
|
@ -155,8 +233,8 @@ struct GetClusterStatusResponse {
|
|||
garage_features: Option<&'static [&'static str]>,
|
||||
rust_version: &'static str,
|
||||
db_engine: String,
|
||||
known_nodes: Vec<KnownNodeResp>,
|
||||
layout: GetClusterLayoutResponse,
|
||||
layout_version: u64,
|
||||
nodes: Vec<NodeResp>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
|
@ -190,14 +268,27 @@ struct NodeRoleResp {
|
|||
tags: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[derive(Serialize, Default)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct KnownNodeResp {
|
||||
struct FreeSpaceResp {
|
||||
available: u64,
|
||||
total: u64,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct NodeResp {
|
||||
id: String,
|
||||
addr: SocketAddr,
|
||||
role: Option<NodeRoleResp>,
|
||||
addr: Option<SocketAddr>,
|
||||
hostname: Option<String>,
|
||||
is_up: bool,
|
||||
last_seen_secs_ago: Option<u64>,
|
||||
hostname: String,
|
||||
draining: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
data_partition: Option<FreeSpaceResp>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
metadata_partition: Option<FreeSpaceResp>,
|
||||
}
|
||||
|
||||
// ---- update functions ----
|
||||
|
@ -208,10 +299,10 @@ pub async fn handle_update_cluster_layout(
|
|||
) -> Result<Response<ResBody>, Error> {
|
||||
let updates = parse_json_body::<UpdateClusterLayoutRequest, _, Error>(req).await?;
|
||||
|
||||
let mut layout = garage.system.get_cluster_layout();
|
||||
let mut layout = garage.system.cluster_layout().inner().clone();
|
||||
|
||||
let mut roles = layout.roles.clone();
|
||||
roles.merge(&layout.staging_roles);
|
||||
let mut roles = layout.current().roles.clone();
|
||||
roles.merge(&layout.staging.get().roles);
|
||||
|
||||
for change in updates {
|
||||
let node = hex::decode(&change.id).ok_or_bad_request("Invalid node identifier")?;
|
||||
|
@ -232,11 +323,17 @@ pub async fn handle_update_cluster_layout(
|
|||
};
|
||||
|
||||
layout
|
||||
.staging_roles
|
||||
.staging
|
||||
.get_mut()
|
||||
.roles
|
||||
.merge(&roles.update_mutator(node, layout::NodeRoleV(new_role)));
|
||||
}
|
||||
|
||||
garage.system.update_cluster_layout(&layout).await?;
|
||||
garage
|
||||
.system
|
||||
.layout_manager
|
||||
.update_cluster_layout(&layout)
|
||||
.await?;
|
||||
|
||||
let res = format_cluster_layout(&layout);
|
||||
Ok(json_ok_response(&res)?)
|
||||
|
@ -246,12 +343,16 @@ pub async fn handle_apply_cluster_layout(
|
|||
garage: &Arc<Garage>,
|
||||
req: Request<IncomingBody>,
|
||||
) -> Result<Response<ResBody>, Error> {
|
||||
let param = parse_json_body::<ApplyRevertLayoutRequest, _, Error>(req).await?;
|
||||
let param = parse_json_body::<ApplyLayoutRequest, _, Error>(req).await?;
|
||||
|
||||
let layout = garage.system.get_cluster_layout();
|
||||
let layout = garage.system.cluster_layout().inner().clone();
|
||||
let (layout, msg) = layout.apply_staged_changes(Some(param.version))?;
|
||||
|
||||
garage.system.update_cluster_layout(&layout).await?;
|
||||
garage
|
||||
.system
|
||||
.layout_manager
|
||||
.update_cluster_layout(&layout)
|
||||
.await?;
|
||||
|
||||
let res = ApplyClusterLayoutResponse {
|
||||
message: msg,
|
||||
|
@ -262,13 +363,14 @@ pub async fn handle_apply_cluster_layout(
|
|||
|
||||
pub async fn handle_revert_cluster_layout(
|
||||
garage: &Arc<Garage>,
|
||||
req: Request<IncomingBody>,
|
||||
) -> Result<Response<ResBody>, Error> {
|
||||
let param = parse_json_body::<ApplyRevertLayoutRequest, _, Error>(req).await?;
|
||||
|
||||
let layout = garage.system.get_cluster_layout();
|
||||
let layout = layout.revert_staged_changes(Some(param.version))?;
|
||||
garage.system.update_cluster_layout(&layout).await?;
|
||||
let layout = garage.system.cluster_layout().inner().clone();
|
||||
let layout = layout.revert_staged_changes()?;
|
||||
garage
|
||||
.system
|
||||
.layout_manager
|
||||
.update_cluster_layout(&layout)
|
||||
.await?;
|
||||
|
||||
let res = format_cluster_layout(&layout);
|
||||
Ok(json_ok_response(&res)?)
|
||||
|
@ -280,7 +382,7 @@ type UpdateClusterLayoutRequest = Vec<NodeRoleChange>;
|
|||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct ApplyRevertLayoutRequest {
|
||||
struct ApplyLayoutRequest {
|
||||
version: u64,
|
||||
}
|
||||
|
||||
|
|
|
@ -59,9 +59,7 @@ impl CommonError {
|
|||
pub fn http_status_code(&self) -> StatusCode {
|
||||
match self {
|
||||
CommonError::InternalError(
|
||||
GarageError::Timeout
|
||||
| GarageError::RemoteError(_)
|
||||
| GarageError::Quorum(_, _, _, _),
|
||||
GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..),
|
||||
) => StatusCode::SERVICE_UNAVAILABLE,
|
||||
CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
|
||||
StatusCode::INTERNAL_SERVER_ERROR
|
||||
|
@ -80,9 +78,7 @@ impl CommonError {
|
|||
match self {
|
||||
CommonError::Forbidden(_) => "AccessDenied",
|
||||
CommonError::InternalError(
|
||||
GarageError::Timeout
|
||||
| GarageError::RemoteError(_)
|
||||
| GarageError::Quorum(_, _, _, _),
|
||||
GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..),
|
||||
) => "ServiceUnavailable",
|
||||
CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
|
||||
"InternalError"
|
||||
|
|
|
@ -1,9 +1,6 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use hyper::Response;
|
||||
use serde::Serialize;
|
||||
|
||||
use garage_rpc::ring::Ring;
|
||||
use garage_table::util::*;
|
||||
|
||||
use garage_model::k2v::item_table::{BYTES, CONFLICTS, ENTRIES, VALUES};
|
||||
|
@ -27,7 +24,11 @@ pub async fn handle_read_index(
|
|||
|
||||
let reverse = reverse.unwrap_or(false);
|
||||
|
||||
let ring: Arc<Ring> = garage.system.ring.borrow().clone();
|
||||
let node_id_vec = garage
|
||||
.system
|
||||
.cluster_layout()
|
||||
.all_nongateway_nodes()
|
||||
.to_vec();
|
||||
|
||||
let (partition_keys, more, next_start) = read_range(
|
||||
&garage.k2v.counter_table.table,
|
||||
|
@ -36,7 +37,7 @@ pub async fn handle_read_index(
|
|||
&start,
|
||||
&end,
|
||||
limit,
|
||||
Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())),
|
||||
Some((DeletedFilter::NotDeleted, node_id_vec)),
|
||||
EnumerationOrder::from_reverse(reverse),
|
||||
)
|
||||
.await?;
|
||||
|
@ -55,7 +56,7 @@ pub async fn handle_read_index(
|
|||
partition_keys: partition_keys
|
||||
.into_iter()
|
||||
.map(|part| {
|
||||
let vals = part.filtered_values(&ring);
|
||||
let vals = part.filtered_values(&garage.system.cluster_layout());
|
||||
ReadIndexResponseEntry {
|
||||
pk: part.sk,
|
||||
entries: *vals.get(&s_entries).unwrap_or(&0),
|
||||
|
|
|
@ -325,7 +325,7 @@ impl ApiHandler for S3ApiServer {
|
|||
part_number_marker: part_number_marker.map(|p| p.min(10000)),
|
||||
max_parts: max_parts.unwrap_or(1000).clamp(1, 1000),
|
||||
};
|
||||
handle_list_parts(ctx, &query).await
|
||||
handle_list_parts(ctx, req, &query).await
|
||||
}
|
||||
Endpoint::DeleteObjects {} => handle_delete_objects(ctx, req, content_sha256).await,
|
||||
Endpoint::GetBucketWebsite {} => handle_get_website(ctx).await,
|
||||
|
|
406
src/api/s3/checksum.rs
Normal file
406
src/api/s3/checksum.rs
Normal file
|
@ -0,0 +1,406 @@
|
|||
use std::convert::{TryFrom, TryInto};
|
||||
use std::hash::Hasher;
|
||||
|
||||
use base64::prelude::*;
|
||||
use crc32c::Crc32cHasher as Crc32c;
|
||||
use crc32fast::Hasher as Crc32;
|
||||
use md5::{Digest, Md5};
|
||||
use sha1::Sha1;
|
||||
use sha2::Sha256;
|
||||
|
||||
use http::{HeaderMap, HeaderName, HeaderValue};
|
||||
|
||||
use garage_util::data::*;
|
||||
use garage_util::error::OkOrMessage;
|
||||
|
||||
use garage_model::s3::object_table::*;
|
||||
|
||||
use crate::s3::error::*;
|
||||
|
||||
// Header names of the `x-amz-checksum-*` family, exposed as `HeaderName`
// constants built with `HeaderName::from_static` from their lowercase
// string form.
pub const X_AMZ_CHECKSUM_ALGORITHM: HeaderName =
|
||||
HeaderName::from_static("x-amz-checksum-algorithm");
|
||||
pub const X_AMZ_CHECKSUM_MODE: HeaderName = HeaderName::from_static("x-amz-checksum-mode");
|
||||
pub const X_AMZ_CHECKSUM_CRC32: HeaderName = HeaderName::from_static("x-amz-checksum-crc32");
|
||||
pub const X_AMZ_CHECKSUM_CRC32C: HeaderName = HeaderName::from_static("x-amz-checksum-crc32c");
|
||||
pub const X_AMZ_CHECKSUM_SHA1: HeaderName = HeaderName::from_static("x-amz-checksum-sha1");
|
||||
pub const X_AMZ_CHECKSUM_SHA256: HeaderName = HeaderName::from_static("x-amz-checksum-sha256");
|
||||
|
||||
// Raw (binary, not text-encoded) checksum values, stored as fixed-size byte
// arrays: 4 bytes for the two CRC32 variants, 16 for MD5, 20 for SHA-1 and
// 32 for SHA-256.
pub type Crc32Checksum = [u8; 4];
|
||||
pub type Crc32cChecksum = [u8; 4];
|
||||
pub type Md5Checksum = [u8; 16];
|
||||
pub type Sha1Checksum = [u8; 20];
|
||||
pub type Sha256Checksum = [u8; 32];
|
||||
|
||||
// Checksum values that a piece of data is expected to match.
// Every field is optional; `Checksummer::init` inspects which ones are set
// to decide which hashers it needs to create.
#[derive(Debug, Default)]
|
||||
pub(crate) struct ExpectedChecksums {
|
||||
// base64-encoded md5 (content-md5 header)
|
||||
pub md5: Option<String>,
|
||||
// content_sha256 (as a Hash / FixedBytes32)
|
||||
pub sha256: Option<Hash>,
|
||||
// extra x-amz-checksum-* header
|
||||
pub extra: Option<ChecksumValue>,
|
||||
}
|
||||
|
||||
// Set of incremental hashers, one optional slot per supported algorithm.
// A slot is `Some` only when that algorithm has been enabled (see `init`
// and `add`); `update` feeds data only to the enabled slots.
pub(crate) struct Checksummer {
|
||||
pub crc32: Option<Crc32>,
|
||||
pub crc32c: Option<Crc32c>,
|
||||
pub md5: Option<Md5>,
|
||||
pub sha1: Option<Sha1>,
|
||||
pub sha256: Option<Sha256>,
|
||||
}
|
||||
|
||||
// Finalized checksum values, one optional fixed-size byte array per
// algorithm. This is the type returned by `Checksummer::finalize`; a field
// is `None` when the corresponding hasher was not enabled.
#[derive(Default)]
|
||||
pub(crate) struct Checksums {
|
||||
pub crc32: Option<Crc32Checksum>,
|
||||
pub crc32c: Option<Crc32cChecksum>,
|
||||
pub md5: Option<Md5Checksum>,
|
||||
pub sha1: Option<Sha1Checksum>,
|
||||
pub sha256: Option<Sha256Checksum>,
|
||||
}
|
||||
|
||||
impl Checksummer {
|
||||
// Builds a `Checksummer` with only the hashers needed to verify `expected`.
//
// - MD5 is enabled when an MD5 value is expected or when `require_md5` is true.
// - SHA-256 is enabled when `expected.sha256` is set or when the extra
//   checksum is a SHA-256 value.
// - CRC32, CRC32C and SHA-1 are enabled only when the extra checksum is of
//   the matching kind.
//
// All other hashers stay `None`.
pub(crate) fn init(expected: &ExpectedChecksums, require_md5: bool) -> Self {
|
||||
let mut ret = Self {
|
||||
crc32: None,
|
||||
crc32c: None,
|
||||
md5: None,
|
||||
sha1: None,
|
||||
sha256: None,
|
||||
};
|
||||
|
||||
if expected.md5.is_some() || require_md5 {
|
||||
ret.md5 = Some(Md5::new());
|
||||
}
|
||||
// SHA-256 can be requested through two independent fields.
if expected.sha256.is_some() || matches!(&expected.extra, Some(ChecksumValue::Sha256(_))) {
|
||||
ret.sha256 = Some(Sha256::new());
|
||||
}
|
||||
if matches!(&expected.extra, Some(ChecksumValue::Crc32(_))) {
|
||||
ret.crc32 = Some(Crc32::new());
|
||||
}
|
||||
if matches!(&expected.extra, Some(ChecksumValue::Crc32c(_))) {
|
||||
ret.crc32c = Some(Crc32c::default());
|
||||
}
|
||||
if matches!(&expected.extra, Some(ChecksumValue::Sha1(_))) {
|
||||
ret.sha1 = Some(Sha1::new());
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
pub(crate) fn add(mut self, algo: Option<ChecksumAlgorithm>) -> Self {
|
||||
match algo {
|
||||
Some(ChecksumAlgorithm::Crc32) => {
|
||||
self.crc32 = Some(Crc32::new());
|
||||
}
|
||||
Some(ChecksumAlgorithm::Crc32c) => {
|
||||
self.crc32c = Some(Crc32c::default());
|
||||
}
|
||||
Some(ChecksumAlgorithm::Sha1) => {
|
||||
self.sha1 = Some(Sha1::new());
|
||||
}
|
||||
Some(ChecksumAlgorithm::Sha256) => {
|
||||
self.sha256 = Some(Sha256::new());
|
||||
}
|
||||
None => (),
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
pub(crate) fn update(&mut self, bytes: &[u8]) {
|
||||
if let Some(crc32) = &mut self.crc32 {
|
||||
crc32.update(bytes);
|
||||
}
|
||||
if let Some(crc32c) = &mut self.crc32c {
|
||||
crc32c.write(bytes);
|
||||
}
|
||||
if let Some(md5) = &mut self.md5 {
|
||||
md5.update(bytes);
|
||||
}
|
||||
if let Some(sha1) = &mut self.sha1 {
|
||||
sha1.update(bytes);
|
||||
}
|
||||
if let Some(sha256) = &mut self.sha256 {
|
||||
sha256.update(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn finalize(self) -> Checksums {
|
||||
Checksums {
|
||||
crc32: self.crc32.map(|x| u32::to_be_bytes(x.finalize())),
|
||||
crc32c: self
|
||||
.crc32c
|
||||
.map(|x| u32::to_be_bytes(u32::try_from(x.finish()).unwrap())),
|
||||
md5: self.md5.map(|x| x.finalize()[..].try_into().unwrap()),
|
||||
sha1: self.sha1.map(|x| x.finalize()[..].try_into().unwrap()),
|
||||
sha256: self.sha256.map(|x| x.finalize()[..].try_into().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Checksums {
|
||||
pub fn verify(&self, expected: &ExpectedChecksums) -> Result<(), Error> {
|
||||
if let Some(expected_md5) = &expected.md5 {
|
||||
match self.md5 {
|
||||
Some(md5) if BASE64_STANDARD.encode(&md5) == expected_md5.trim_matches('"') => (),
|
||||
_ => {
|
||||
return Err(Error::InvalidDigest(
|
||||
"MD5 checksum verification failed (from content-md5)".into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(expected_sha256) = &expected.sha256 {
|
||||
match self.sha256 {
|
||||
Some(sha256) if &sha256[..] == expected_sha256.as_slice() => (),
|
||||
_ => {
|
||||
return Err(Error::InvalidDigest(
|
||||
"SHA256 checksum verification failed (from x-amz-content-sha256)".into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(extra) = expected.extra {
|
||||
let algo = extra.algorithm();
|
||||
if self.extract(Some(algo)) != Some(extra) {
|
||||
return Err(Error::InvalidDigest(format!(
|
||||
"Failed to validate checksum for algorithm {:?}",
|
||||
algo
|
||||
)));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn extract(&self, algo: Option<ChecksumAlgorithm>) -> Option<ChecksumValue> {
|
||||
match algo {
|
||||
None => None,
|
||||
Some(ChecksumAlgorithm::Crc32) => Some(ChecksumValue::Crc32(self.crc32.unwrap())),
|
||||
Some(ChecksumAlgorithm::Crc32c) => Some(ChecksumValue::Crc32c(self.crc32c.unwrap())),
|
||||
Some(ChecksumAlgorithm::Sha1) => Some(ChecksumValue::Sha1(self.sha1.unwrap())),
|
||||
Some(ChecksumAlgorithm::Sha256) => Some(ChecksumValue::Sha256(self.sha256.unwrap())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ----
|
||||
|
||||
/// Accumulates per-part digests of a multipart upload.
///
/// `update` feeds the hex-decoded ETag of each part into `md5` and, when
/// an extra algorithm is selected, the part's checksum value into `extra`.
#[derive(Default)]
|
||||
pub(crate) struct MultipartChecksummer {
|
||||
// running MD5 over the decoded ETags of the parts
pub md5: Md5,
|
||||
// running hasher over the parts' extra checksum values, if an algorithm is selected
pub extra: Option<MultipartExtraChecksummer>,
|
||||
}
|
||||
|
||||
/// Running hasher for the extra checksum of a multipart upload,
/// one variant per supported `ChecksumAlgorithm`.
pub(crate) enum MultipartExtraChecksummer {
|
||||
Crc32(Crc32),
|
||||
Crc32c(Crc32c),
|
||||
Sha1(Sha1),
|
||||
Sha256(Sha256),
|
||||
}
|
||||
|
||||
impl MultipartChecksummer {
|
||||
pub(crate) fn init(algo: Option<ChecksumAlgorithm>) -> Self {
|
||||
Self {
|
||||
md5: Md5::new(),
|
||||
extra: match algo {
|
||||
None => None,
|
||||
Some(ChecksumAlgorithm::Crc32) => {
|
||||
Some(MultipartExtraChecksummer::Crc32(Crc32::new()))
|
||||
}
|
||||
Some(ChecksumAlgorithm::Crc32c) => {
|
||||
Some(MultipartExtraChecksummer::Crc32c(Crc32c::default()))
|
||||
}
|
||||
Some(ChecksumAlgorithm::Sha1) => Some(MultipartExtraChecksummer::Sha1(Sha1::new())),
|
||||
Some(ChecksumAlgorithm::Sha256) => {
|
||||
Some(MultipartExtraChecksummer::Sha256(Sha256::new()))
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update(
|
||||
&mut self,
|
||||
etag: &str,
|
||||
checksum: Option<ChecksumValue>,
|
||||
) -> Result<(), Error> {
|
||||
self.md5
|
||||
.update(&hex::decode(&etag).ok_or_message("invalid etag hex")?);
|
||||
match (&mut self.extra, checksum) {
|
||||
(None, _) => (),
|
||||
(
|
||||
Some(MultipartExtraChecksummer::Crc32(ref mut crc32)),
|
||||
Some(ChecksumValue::Crc32(x)),
|
||||
) => {
|
||||
crc32.update(&x);
|
||||
}
|
||||
(
|
||||
Some(MultipartExtraChecksummer::Crc32c(ref mut crc32c)),
|
||||
Some(ChecksumValue::Crc32c(x)),
|
||||
) => {
|
||||
crc32c.write(&x);
|
||||
}
|
||||
(Some(MultipartExtraChecksummer::Sha1(ref mut sha1)), Some(ChecksumValue::Sha1(x))) => {
|
||||
sha1.update(&x);
|
||||
}
|
||||
(
|
||||
Some(MultipartExtraChecksummer::Sha256(ref mut sha256)),
|
||||
Some(ChecksumValue::Sha256(x)),
|
||||
) => {
|
||||
sha256.update(&x);
|
||||
}
|
||||
(Some(_), b) => {
|
||||
return Err(Error::internal_error(format!(
|
||||
"part checksum was not computed correctly, got: {:?}",
|
||||
b
|
||||
)))
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn finalize(self) -> (Md5Checksum, Option<ChecksumValue>) {
|
||||
let md5 = self.md5.finalize()[..].try_into().unwrap();
|
||||
let extra = match self.extra {
|
||||
None => None,
|
||||
Some(MultipartExtraChecksummer::Crc32(crc32)) => {
|
||||
Some(ChecksumValue::Crc32(u32::to_be_bytes(crc32.finalize())))
|
||||
}
|
||||
Some(MultipartExtraChecksummer::Crc32c(crc32c)) => Some(ChecksumValue::Crc32c(
|
||||
u32::to_be_bytes(u32::try_from(crc32c.finish()).unwrap()),
|
||||
)),
|
||||
Some(MultipartExtraChecksummer::Sha1(sha1)) => {
|
||||
Some(ChecksumValue::Sha1(sha1.finalize()[..].try_into().unwrap()))
|
||||
}
|
||||
Some(MultipartExtraChecksummer::Sha256(sha256)) => Some(ChecksumValue::Sha256(
|
||||
sha256.finalize()[..].try_into().unwrap(),
|
||||
)),
|
||||
};
|
||||
(md5, extra)
|
||||
}
|
||||
}
|
||||
|
||||
// ----
|
||||
|
||||
/// Extract the value of the x-amz-checksum-algorithm header
|
||||
pub(crate) fn request_checksum_algorithm(
|
||||
headers: &HeaderMap<HeaderValue>,
|
||||
) -> Result<Option<ChecksumAlgorithm>, Error> {
|
||||
match headers.get(X_AMZ_CHECKSUM_ALGORITHM) {
|
||||
None => Ok(None),
|
||||
Some(x) if x == "CRC32" => Ok(Some(ChecksumAlgorithm::Crc32)),
|
||||
Some(x) if x == "CRC32C" => Ok(Some(ChecksumAlgorithm::Crc32c)),
|
||||
Some(x) if x == "SHA1" => Ok(Some(ChecksumAlgorithm::Sha1)),
|
||||
Some(x) if x == "SHA256" => Ok(Some(ChecksumAlgorithm::Sha256)),
|
||||
_ => Err(Error::bad_request("invalid checksum algorithm")),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the value of any of the x-amz-checksum-* headers
|
||||
pub(crate) fn request_checksum_value(
|
||||
headers: &HeaderMap<HeaderValue>,
|
||||
) -> Result<Option<ChecksumValue>, Error> {
|
||||
let mut ret = vec![];
|
||||
|
||||
if let Some(crc32_str) = headers.get(X_AMZ_CHECKSUM_CRC32) {
|
||||
let crc32 = BASE64_STANDARD
|
||||
.decode(&crc32_str)
|
||||
.ok()
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-crc32 header")?;
|
||||
ret.push(ChecksumValue::Crc32(crc32))
|
||||
}
|
||||
if let Some(crc32c_str) = headers.get(X_AMZ_CHECKSUM_CRC32C) {
|
||||
let crc32c = BASE64_STANDARD
|
||||
.decode(&crc32c_str)
|
||||
.ok()
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-crc32c header")?;
|
||||
ret.push(ChecksumValue::Crc32c(crc32c))
|
||||
}
|
||||
if let Some(sha1_str) = headers.get(X_AMZ_CHECKSUM_SHA1) {
|
||||
let sha1 = BASE64_STANDARD
|
||||
.decode(&sha1_str)
|
||||
.ok()
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-sha1 header")?;
|
||||
ret.push(ChecksumValue::Sha1(sha1))
|
||||
}
|
||||
if let Some(sha256_str) = headers.get(X_AMZ_CHECKSUM_SHA256) {
|
||||
let sha256 = BASE64_STANDARD
|
||||
.decode(&sha256_str)
|
||||
.ok()
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-sha256 header")?;
|
||||
ret.push(ChecksumValue::Sha256(sha256))
|
||||
}
|
||||
|
||||
if ret.len() > 1 {
|
||||
return Err(Error::bad_request(
|
||||
"multiple x-amz-checksum-* headers given",
|
||||
));
|
||||
}
|
||||
Ok(ret.pop())
|
||||
}
|
||||
|
||||
/// Checks for the presense of x-amz-checksum-algorithm
|
||||
/// if so extract the corrseponding x-amz-checksum-* value
|
||||
pub(crate) fn request_checksum_algorithm_value(
|
||||
headers: &HeaderMap<HeaderValue>,
|
||||
) -> Result<Option<ChecksumValue>, Error> {
|
||||
match headers.get(X_AMZ_CHECKSUM_ALGORITHM) {
|
||||
Some(x) if x == "CRC32" => {
|
||||
let crc32 = headers
|
||||
.get(X_AMZ_CHECKSUM_CRC32)
|
||||
.and_then(|x| BASE64_STANDARD.decode(&x).ok())
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-crc32 header")?;
|
||||
Ok(Some(ChecksumValue::Crc32(crc32)))
|
||||
}
|
||||
Some(x) if x == "CRC32C" => {
|
||||
let crc32c = headers
|
||||
.get(X_AMZ_CHECKSUM_CRC32C)
|
||||
.and_then(|x| BASE64_STANDARD.decode(&x).ok())
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-crc32c header")?;
|
||||
Ok(Some(ChecksumValue::Crc32c(crc32c)))
|
||||
}
|
||||
Some(x) if x == "SHA1" => {
|
||||
let sha1 = headers
|
||||
.get(X_AMZ_CHECKSUM_SHA1)
|
||||
.and_then(|x| BASE64_STANDARD.decode(&x).ok())
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-sha1 header")?;
|
||||
Ok(Some(ChecksumValue::Sha1(sha1)))
|
||||
}
|
||||
Some(x) if x == "SHA256" => {
|
||||
let sha256 = headers
|
||||
.get(X_AMZ_CHECKSUM_SHA256)
|
||||
.and_then(|x| BASE64_STANDARD.decode(&x).ok())
|
||||
.and_then(|x| x.try_into().ok())
|
||||
.ok_or_bad_request("invalid x-amz-checksum-sha256 header")?;
|
||||
Ok(Some(ChecksumValue::Sha256(sha256)))
|
||||
}
|
||||
Some(_) => Err(Error::bad_request("invalid x-amz-checksum-algorithm")),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn add_checksum_response_headers(
|
||||
checksum: &Option<ChecksumValue>,
|
||||
mut resp: http::response::Builder,
|
||||
) -> http::response::Builder {
|
||||
match checksum {
|
||||
Some(ChecksumValue::Crc32(crc32)) => {
|
||||
resp = resp.header(X_AMZ_CHECKSUM_CRC32, BASE64_STANDARD.encode(&crc32));
|
||||
}
|
||||
Some(ChecksumValue::Crc32c(crc32c)) => {
|
||||
resp = resp.header(X_AMZ_CHECKSUM_CRC32C, BASE64_STANDARD.encode(&crc32c));
|
||||
}
|
||||
Some(ChecksumValue::Sha1(sha1)) => {
|
||||
resp = resp.header(X_AMZ_CHECKSUM_SHA1, BASE64_STANDARD.encode(&sha1));
|
||||
}
|
||||
Some(ChecksumValue::Sha256(sha256)) => {
|
||||
resp = resp.header(X_AMZ_CHECKSUM_SHA256, BASE64_STANDARD.encode(&sha256));
|
||||
}
|
||||
None => (),
|
||||
}
|
||||
resp
|
||||
}
|
|
@ -1,17 +1,18 @@
|
|||
use std::pin::Pin;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use futures::{stream, stream::Stream, StreamExt};
|
||||
use md5::{Digest as Md5Digest, Md5};
|
||||
use futures::{stream, stream::Stream, StreamExt, TryStreamExt};
|
||||
|
||||
use bytes::Bytes;
|
||||
use hyper::{Request, Response};
|
||||
use serde::Serialize;
|
||||
|
||||
use garage_net::bytes_buf::BytesBuf;
|
||||
use garage_net::stream::read_stream_to_end;
|
||||
use garage_rpc::rpc_helper::OrderTag;
|
||||
use garage_table::*;
|
||||
use garage_util::data::*;
|
||||
use garage_util::error::Error as GarageError;
|
||||
use garage_util::time::*;
|
||||
|
||||
use garage_model::s3::block_ref_table::*;
|
||||
|
@ -21,11 +22,16 @@ use garage_model::s3::version_table::*;
|
|||
|
||||
use crate::helpers::*;
|
||||
use crate::s3::api_server::{ReqBody, ResBody};
|
||||
use crate::s3::checksum::*;
|
||||
use crate::s3::encryption::EncryptionParams;
|
||||
use crate::s3::error::*;
|
||||
use crate::s3::get::full_object_byte_stream;
|
||||
use crate::s3::multipart;
|
||||
use crate::s3::put::get_headers;
|
||||
use crate::s3::put::{get_headers, save_stream, ChecksumMode, SaveStreamResult};
|
||||
use crate::s3::xml::{self as s3_xml, xmlns_tag};
|
||||
|
||||
// -------- CopyObject ---------
|
||||
|
||||
pub async fn handle_copy(
|
||||
ctx: ReqCtx,
|
||||
req: &Request<ReqBody>,
|
||||
|
@ -33,13 +39,9 @@ pub async fn handle_copy(
|
|||
) -> Result<Response<ResBody>, Error> {
|
||||
let copy_precondition = CopyPreconditionHeaders::parse(req)?;
|
||||
|
||||
let source_object = get_copy_source(&ctx, req).await?;
|
||||
let checksum_algorithm = request_checksum_algorithm(req.headers())?;
|
||||
|
||||
let ReqCtx {
|
||||
garage,
|
||||
bucket_id: dest_bucket_id,
|
||||
..
|
||||
} = ctx;
|
||||
let source_object = get_copy_source(&ctx, req).await?;
|
||||
|
||||
let (source_version, source_version_data, source_version_meta) =
|
||||
extract_source_info(&source_object)?;
|
||||
|
@ -47,26 +49,150 @@ pub async fn handle_copy(
|
|||
// Check precondition, e.g. x-amz-copy-source-if-match
|
||||
copy_precondition.check(source_version, &source_version_meta.etag)?;
|
||||
|
||||
// Determine encryption parameters
|
||||
let (source_encryption, source_object_meta_inner) =
|
||||
EncryptionParams::check_decrypt_for_copy_source(
|
||||
&ctx.garage,
|
||||
req.headers(),
|
||||
&source_version_meta.encryption,
|
||||
)?;
|
||||
let dest_encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?;
|
||||
|
||||
// Extract source checksum info before source_object_meta_inner is consumed
|
||||
let source_checksum = source_object_meta_inner.checksum;
|
||||
let source_checksum_algorithm = source_checksum.map(|x| x.algorithm());
|
||||
|
||||
// If source object has a checksum, the destination object must as well.
|
||||
// The x-amz-checksum-algorithm header allows changing that algorithm,
|
||||
// but if it is absent, we must use the same as before
|
||||
let checksum_algorithm = checksum_algorithm.or(source_checksum_algorithm);
|
||||
|
||||
// Determine metadata of destination object
|
||||
let was_multipart = source_version_meta.etag.contains('-');
|
||||
let dest_object_meta = ObjectVersionMetaInner {
|
||||
headers: match req.headers().get("x-amz-metadata-directive") {
|
||||
Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => {
|
||||
get_headers(req.headers())?
|
||||
}
|
||||
_ => source_object_meta_inner.into_owned().headers,
|
||||
},
|
||||
checksum: source_checksum,
|
||||
};
|
||||
|
||||
// Do actual object copying
|
||||
//
|
||||
// In any of the following scenarios, we need to read the whole object
|
||||
// data and re-write it again:
|
||||
//
|
||||
// - the data needs to be decrypted or encrypted
|
||||
// - the requested checksum algorithm requires us to recompute a checksum
|
||||
// - the original object was a multipart upload and a checksum algorithm
|
||||
// is defined (AWS specifies that in this case, we must recompute the
|
||||
// checksum from scratch as if this was a single big object and not
|
||||
// a multipart object, as the checksums are not computed in the same way)
|
||||
//
|
||||
// In other cases, we can just copy the metadata and reference the same blocks.
|
||||
//
|
||||
// See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html
|
||||
|
||||
let must_recopy = !EncryptionParams::is_same(&source_encryption, &dest_encryption)
|
||||
|| source_checksum_algorithm != checksum_algorithm
|
||||
|| (was_multipart && checksum_algorithm.is_some());
|
||||
|
||||
let res = if !must_recopy {
|
||||
// In most cases, we can just copy the metadata and link blocks of the
|
||||
// old object from the new object.
|
||||
handle_copy_metaonly(
|
||||
ctx,
|
||||
dest_key,
|
||||
dest_object_meta,
|
||||
dest_encryption,
|
||||
source_version,
|
||||
source_version_data,
|
||||
source_version_meta,
|
||||
)
|
||||
.await?
|
||||
} else {
|
||||
let expected_checksum = ExpectedChecksums {
|
||||
md5: None,
|
||||
sha256: None,
|
||||
extra: source_checksum,
|
||||
};
|
||||
let checksum_mode = if was_multipart || source_checksum_algorithm != checksum_algorithm {
|
||||
ChecksumMode::Calculate(checksum_algorithm)
|
||||
} else {
|
||||
ChecksumMode::Verify(&expected_checksum)
|
||||
};
|
||||
// If source and dest encryption use different keys,
|
||||
// we must decrypt content and re-encrypt, so rewrite all data blocks.
|
||||
handle_copy_reencrypt(
|
||||
ctx,
|
||||
dest_key,
|
||||
dest_object_meta,
|
||||
dest_encryption,
|
||||
source_version,
|
||||
source_version_data,
|
||||
source_encryption,
|
||||
checksum_mode,
|
||||
)
|
||||
.await?
|
||||
};
|
||||
|
||||
let last_modified = msec_to_rfc3339(res.version_timestamp);
|
||||
let result = CopyObjectResult {
|
||||
last_modified: s3_xml::Value(last_modified),
|
||||
etag: s3_xml::Value(format!("\"{}\"", res.etag)),
|
||||
};
|
||||
let xml = s3_xml::to_xml_with_header(&result)?;
|
||||
|
||||
let mut resp = Response::builder()
|
||||
.header("Content-Type", "application/xml")
|
||||
.header("x-amz-version-id", hex::encode(res.version_uuid))
|
||||
.header(
|
||||
"x-amz-copy-source-version-id",
|
||||
hex::encode(source_version.uuid),
|
||||
);
|
||||
dest_encryption.add_response_headers(&mut resp);
|
||||
Ok(resp.body(string_body(xml))?)
|
||||
}
|
||||
|
||||
async fn handle_copy_metaonly(
|
||||
ctx: ReqCtx,
|
||||
dest_key: &str,
|
||||
dest_object_meta: ObjectVersionMetaInner,
|
||||
dest_encryption: EncryptionParams,
|
||||
source_version: &ObjectVersion,
|
||||
source_version_data: &ObjectVersionData,
|
||||
source_version_meta: &ObjectVersionMeta,
|
||||
) -> Result<SaveStreamResult, Error> {
|
||||
let ReqCtx {
|
||||
garage,
|
||||
bucket_id: dest_bucket_id,
|
||||
..
|
||||
} = ctx;
|
||||
|
||||
// Generate parameters for copied object
|
||||
let new_uuid = gen_uuid();
|
||||
let new_timestamp = now_msec();
|
||||
|
||||
// Implement x-amz-metadata-directive: REPLACE
|
||||
let new_meta = match req.headers().get("x-amz-metadata-directive") {
|
||||
Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta {
|
||||
headers: get_headers(req.headers())?,
|
||||
let new_meta = ObjectVersionMeta {
|
||||
encryption: dest_encryption.encrypt_meta(dest_object_meta)?,
|
||||
size: source_version_meta.size,
|
||||
etag: source_version_meta.etag.clone(),
|
||||
},
|
||||
_ => source_version_meta.clone(),
|
||||
};
|
||||
|
||||
let etag = new_meta.etag.to_string();
|
||||
let res = SaveStreamResult {
|
||||
version_uuid: new_uuid,
|
||||
version_timestamp: new_timestamp,
|
||||
etag: new_meta.etag.clone(),
|
||||
};
|
||||
|
||||
// Save object copy
|
||||
match source_version_data {
|
||||
ObjectVersionData::DeleteMarker => unreachable!(),
|
||||
ObjectVersionData::Inline(_meta, bytes) => {
|
||||
// bytes is either plaintext before&after or encrypted with the
|
||||
// same keys, so it's ok to just copy it as is
|
||||
let dest_object_version = ObjectVersion {
|
||||
uuid: new_uuid,
|
||||
timestamp: new_timestamp,
|
||||
|
@ -97,7 +223,8 @@ pub async fn handle_copy(
|
|||
uuid: new_uuid,
|
||||
timestamp: new_timestamp,
|
||||
state: ObjectVersionState::Uploading {
|
||||
headers: new_meta.headers.clone(),
|
||||
encryption: new_meta.encryption.clone(),
|
||||
checksum_algorithm: None,
|
||||
multipart: false,
|
||||
},
|
||||
};
|
||||
|
@ -164,23 +291,42 @@ pub async fn handle_copy(
|
|||
}
|
||||
}
|
||||
|
||||
let last_modified = msec_to_rfc3339(new_timestamp);
|
||||
let result = CopyObjectResult {
|
||||
last_modified: s3_xml::Value(last_modified),
|
||||
etag: s3_xml::Value(format!("\"{}\"", etag)),
|
||||
};
|
||||
let xml = s3_xml::to_xml_with_header(&result)?;
|
||||
|
||||
Ok(Response::builder()
|
||||
.header("Content-Type", "application/xml")
|
||||
.header("x-amz-version-id", hex::encode(new_uuid))
|
||||
.header(
|
||||
"x-amz-copy-source-version-id",
|
||||
hex::encode(source_version.uuid),
|
||||
)
|
||||
.body(string_body(xml))?)
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn handle_copy_reencrypt(
|
||||
ctx: ReqCtx,
|
||||
dest_key: &str,
|
||||
dest_object_meta: ObjectVersionMetaInner,
|
||||
dest_encryption: EncryptionParams,
|
||||
source_version: &ObjectVersion,
|
||||
source_version_data: &ObjectVersionData,
|
||||
source_encryption: EncryptionParams,
|
||||
checksum_mode: ChecksumMode<'_>,
|
||||
) -> Result<SaveStreamResult, Error> {
|
||||
// basically we will read the source data (decrypt if necessary)
|
||||
// and save that in a new object (encrypt if necessary),
|
||||
// by combining the code used in getobject and putobject
|
||||
let source_stream = full_object_byte_stream(
|
||||
ctx.garage.clone(),
|
||||
source_version,
|
||||
source_version_data,
|
||||
source_encryption,
|
||||
);
|
||||
|
||||
save_stream(
|
||||
&ctx,
|
||||
dest_object_meta,
|
||||
dest_encryption,
|
||||
source_stream.map_err(|e| Error::from(GarageError::from(e))),
|
||||
&dest_key.to_string(),
|
||||
checksum_mode,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
// -------- UploadPartCopy ---------
|
||||
|
||||
pub async fn handle_upload_part_copy(
|
||||
ctx: ReqCtx,
|
||||
req: &Request<ReqBody>,
|
||||
|
@ -193,7 +339,7 @@ pub async fn handle_upload_part_copy(
|
|||
let dest_upload_id = multipart::decode_upload_id(upload_id)?;
|
||||
|
||||
let dest_key = dest_key.to_string();
|
||||
let (source_object, (_, _, mut dest_mpu)) = futures::try_join!(
|
||||
let (source_object, (_, dest_version, mut dest_mpu)) = futures::try_join!(
|
||||
get_copy_source(&ctx, req),
|
||||
multipart::get_upload(&ctx, &dest_key, &dest_upload_id)
|
||||
)?;
|
||||
|
@ -206,6 +352,24 @@ pub async fn handle_upload_part_copy(
|
|||
// Check precondition on source, e.g. x-amz-copy-source-if-match
|
||||
copy_precondition.check(source_object_version, &source_version_meta.etag)?;
|
||||
|
||||
// Determine encryption parameters
|
||||
let (source_encryption, _) = EncryptionParams::check_decrypt_for_copy_source(
|
||||
&garage,
|
||||
req.headers(),
|
||||
&source_version_meta.encryption,
|
||||
)?;
|
||||
let (dest_object_encryption, dest_object_checksum_algorithm) = match dest_version.state {
|
||||
ObjectVersionState::Uploading {
|
||||
encryption,
|
||||
checksum_algorithm,
|
||||
..
|
||||
} => (encryption, checksum_algorithm),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let (dest_encryption, _) =
|
||||
EncryptionParams::check_decrypt(&garage, req.headers(), &dest_object_encryption)?;
|
||||
let same_encryption = EncryptionParams::is_same(&source_encryption, &dest_encryption);
|
||||
|
||||
// Check source range is valid
|
||||
let source_range = match req.headers().get("x-amz-copy-source-range") {
|
||||
Some(range) => {
|
||||
|
@ -227,9 +391,7 @@ pub async fn handle_upload_part_copy(
|
|||
};
|
||||
|
||||
// Check source version is not inlined
|
||||
match source_version_data {
|
||||
ObjectVersionData::DeleteMarker => unreachable!(),
|
||||
ObjectVersionData::Inline(_meta, _bytes) => {
|
||||
if matches!(source_version_data, ObjectVersionData::Inline(_, _)) {
|
||||
// This is only for small files, we don't bother handling this.
|
||||
// (in AWS UploadPartCopy works for parts at least 5MB which
|
||||
// is never the case of an inline object)
|
||||
|
@ -237,11 +399,8 @@ pub async fn handle_upload_part_copy(
|
|||
"Source object is too small (minimum part size is 5Mb)",
|
||||
));
|
||||
}
|
||||
ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (),
|
||||
};
|
||||
|
||||
// Fetch source versin with its block list,
|
||||
// and destination version to check part hasn't yet been uploaded
|
||||
// Fetch source version with its block list
|
||||
let source_version = garage
|
||||
.version_table
|
||||
.get(&source_object_version.uuid, &EmptyKey)
|
||||
|
@ -251,7 +410,9 @@ pub async fn handle_upload_part_copy(
|
|||
// We want to reuse blocks from the source version as much as possible.
|
||||
// However, we still need to get the data from these blocks
|
||||
// because we need to know it to calculate the MD5sum of the part
|
||||
// which is used as its ETag.
|
||||
// which is used as its ETag. For encrypted sources or destinations,
|
||||
// we must always read(+decrypt) and then write(+encrypt), so we
|
||||
// can never reuse data blocks as is.
|
||||
|
||||
// First, calculate what blocks we want to keep,
|
||||
// and the subrange of the block to take, if the bounds of the
|
||||
|
@ -300,7 +461,9 @@ pub async fn handle_upload_part_copy(
|
|||
dest_mpu_part_key,
|
||||
MpuPart {
|
||||
version: dest_version_id,
|
||||
// These are all filled in later (bottom of this function)
|
||||
etag: None,
|
||||
checksum: None,
|
||||
size: None,
|
||||
},
|
||||
);
|
||||
|
@ -313,32 +476,55 @@ pub async fn handle_upload_part_copy(
|
|||
},
|
||||
false,
|
||||
);
|
||||
// write an empty version now to be the parent of the block_ref entries
|
||||
garage.version_table.insert(&dest_version).await?;
|
||||
|
||||
// Now, actually copy the blocks
|
||||
let mut md5hasher = Md5::new();
|
||||
let mut checksummer = Checksummer::init(&Default::default(), !dest_encryption.is_encrypted())
|
||||
.add(dest_object_checksum_algorithm);
|
||||
|
||||
// First, create a stream that is able to read the source blocks
|
||||
// and extract the subrange if necessary.
|
||||
// The second returned value is an Option<Hash>, that is Some
|
||||
// if and only if the block returned is a block that already existed
|
||||
// in the Garage data store (thus we don't need to save it again).
|
||||
// in the Garage data store and can be reused as-is instead of having
|
||||
// to save it again. This excludes encrypted source blocks that we had
|
||||
// to decrypt.
|
||||
let garage2 = garage.clone();
|
||||
let order_stream = OrderTag::stream();
|
||||
let source_blocks = stream::iter(blocks_to_copy)
|
||||
.enumerate()
|
||||
.flat_map(|(i, (block_hash, range_to_copy))| {
|
||||
.map(|(i, (block_hash, range_to_copy))| {
|
||||
let garage3 = garage2.clone();
|
||||
stream::once(async move {
|
||||
let data = garage3
|
||||
.block_manager
|
||||
.rpc_get_block(&block_hash, Some(order_stream.order(i as u64)))
|
||||
async move {
|
||||
let stream = source_encryption
|
||||
.get_block(&garage3, &block_hash, Some(order_stream.order(i as u64)))
|
||||
.await?;
|
||||
let data = read_stream_to_end(stream).await?.into_bytes();
|
||||
// For each item, we return a tuple of:
|
||||
// 1. the full data block (decrypted)
|
||||
// 2. an Option<Hash> that indicates the hash of the block in the block store,
|
||||
// only if it can be re-used as-is in the copied object
|
||||
match range_to_copy {
|
||||
Some(r) => Ok((data.slice(r), None)),
|
||||
None => Ok((data, Some(block_hash))),
|
||||
Some(r) => {
|
||||
// If we are taking a subslice of the data, we cannot reuse the block as-is
|
||||
Ok((data.slice(r), None))
|
||||
}
|
||||
None if same_encryption => {
|
||||
// If the data is unencrypted before & after, or if we are using
|
||||
// the same encryption key, we can reuse the stored block, no need
|
||||
// to re-send it to storage nodes.
|
||||
Ok((data, Some(block_hash)))
|
||||
}
|
||||
None => {
|
||||
// If we are decrypting / (re)encrypting with different keys,
|
||||
// we cannot reuse the block as-is
|
||||
Ok((data, None))
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
.buffered(2)
|
||||
.peekable();
|
||||
|
||||
// The defragmenter is a custom stream (defined below) that concatenates
|
||||
|
@ -346,22 +532,39 @@ pub async fn handle_upload_part_copy(
|
|||
// It returns a series of (Vec<u8>, Option<Hash>).
|
||||
// When it is done, it returns an empty vec.
|
||||
// Same as the previous iterator, the Option is Some(_) if and only if
|
||||
// it's an existing block of the Garage data store.
|
||||
// it's an existing block of the Garage data store that can be reused.
|
||||
let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks));
|
||||
|
||||
let mut current_offset = 0;
|
||||
let mut next_block = defragmenter.next().await?;
|
||||
|
||||
// TODO this could be optimized similarly to read_and_put_blocks
|
||||
// low priority because uploadpartcopy is rarely used
|
||||
loop {
|
||||
let (data, existing_block_hash) = next_block;
|
||||
if data.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
md5hasher.update(&data[..]);
|
||||
let data_len = data.len() as u64;
|
||||
|
||||
let must_upload = existing_block_hash.is_none();
|
||||
let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..]));
|
||||
let (checksummer_updated, (data_to_upload, final_hash)) =
|
||||
tokio::task::spawn_blocking(move || {
|
||||
checksummer.update(&data[..]);
|
||||
|
||||
let tup = match existing_block_hash {
|
||||
Some(hash) if same_encryption => (None, hash),
|
||||
_ => {
|
||||
let data_enc = dest_encryption.encrypt_block(data)?;
|
||||
let hash = blake2sum(&data_enc);
|
||||
(Some(data_enc), hash)
|
||||
}
|
||||
};
|
||||
Ok::<_, Error>((checksummer, tup))
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
checksummer = checksummer_updated;
|
||||
|
||||
dest_version.blocks.clear();
|
||||
dest_version.blocks.put(
|
||||
|
@ -371,10 +574,10 @@ pub async fn handle_upload_part_copy(
|
|||
},
|
||||
VersionBlock {
|
||||
hash: final_hash,
|
||||
size: data.len() as u64,
|
||||
size: data_len,
|
||||
},
|
||||
);
|
||||
current_offset += data.len() as u64;
|
||||
current_offset += data_len;
|
||||
|
||||
let block_ref = BlockRef {
|
||||
block: final_hash,
|
||||
|
@ -382,36 +585,34 @@ pub async fn handle_upload_part_copy(
|
|||
deleted: false.into(),
|
||||
};
|
||||
|
||||
let garage2 = garage.clone();
|
||||
let res = futures::try_join!(
|
||||
let (_, _, _, next) = futures::try_join!(
|
||||
// Thing 1: if the block is not exactly a block that existed before,
|
||||
// we need to insert that data as a new block.
|
||||
async move {
|
||||
if must_upload {
|
||||
garage2
|
||||
async {
|
||||
if let Some(final_data) = data_to_upload {
|
||||
garage
|
||||
.block_manager
|
||||
.rpc_put_block(final_hash, data, None)
|
||||
.rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None)
|
||||
.await
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
async {
|
||||
// Thing 2: we need to insert the block in the version
|
||||
garage.version_table.insert(&dest_version).await?;
|
||||
garage.version_table.insert(&dest_version),
|
||||
// Thing 3: we need to add a block reference
|
||||
garage.block_ref_table.insert(&block_ref).await
|
||||
},
|
||||
// Thing 4: we need to prefetch the next block
|
||||
garage.block_ref_table.insert(&block_ref),
|
||||
// Thing 4: we need to read the next block
|
||||
defragmenter.next(),
|
||||
)?;
|
||||
next_block = res.2;
|
||||
next_block = next;
|
||||
}
|
||||
|
||||
assert_eq!(current_offset, source_range.length);
|
||||
|
||||
let data_md5sum = md5hasher.finalize();
|
||||
let etag = hex::encode(data_md5sum);
|
||||
let checksums = checksummer.finalize();
|
||||
let etag = dest_encryption.etag_from_md5(&checksums.md5);
|
||||
let checksum = checksums.extract(dest_object_checksum_algorithm);
|
||||
|
||||
// Put the part's ETag in the Versiontable
|
||||
dest_mpu.parts.put(
|
||||
|
@ -419,6 +620,7 @@ pub async fn handle_upload_part_copy(
|
|||
MpuPart {
|
||||
version: dest_version_id,
|
||||
etag: Some(etag.clone()),
|
||||
checksum,
|
||||
size: Some(current_offset),
|
||||
},
|
||||
);
|
||||
|
@ -431,13 +633,14 @@ pub async fn handle_upload_part_copy(
|
|||
last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)),
|
||||
})?;
|
||||
|
||||
Ok(Response::builder()
|
||||
let mut resp = Response::builder()
|
||||
.header("Content-Type", "application/xml")
|
||||
.header(
|
||||
"x-amz-copy-source-version-id",
|
||||
hex::encode(source_object_version.uuid),
|
||||
)
|
||||
.body(string_body(resp_xml))?)
|
||||
);
|
||||
dest_encryption.add_response_headers(&mut resp);
|
||||
Ok(resp.body(string_body(resp_xml))?)
|
||||
}
|
||||
|
||||
async fn get_copy_source(ctx: &ReqCtx, req: &Request<ReqBody>) -> Result<Object, Error> {
|
||||
|
|
595
src/api/s3/encryption.rs
Normal file
595
src/api/s3/encryption.rs
Normal file
|
@ -0,0 +1,595 @@
|
|||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::pin::Pin;
|
||||
|
||||
use aes_gcm::{
|
||||
aead::stream::{DecryptorLE31, EncryptorLE31, StreamLE31},
|
||||
aead::{Aead, AeadCore, KeyInit, OsRng},
|
||||
aes::cipher::crypto_common::rand_core::RngCore,
|
||||
aes::cipher::typenum::Unsigned,
|
||||
Aes256Gcm, Key, Nonce,
|
||||
};
|
||||
use base64::prelude::*;
|
||||
use bytes::Bytes;
|
||||
|
||||
use futures::stream::Stream;
|
||||
use futures::task;
|
||||
use tokio::io::BufReader;
|
||||
|
||||
use http::header::{HeaderMap, HeaderName, HeaderValue};
|
||||
|
||||
use garage_net::bytes_buf::BytesBuf;
|
||||
use garage_net::stream::{stream_asyncread, ByteStream};
|
||||
use garage_rpc::rpc_helper::OrderTag;
|
||||
use garage_util::data::Hash;
|
||||
use garage_util::error::Error as GarageError;
|
||||
use garage_util::migrate::Migrate;
|
||||
|
||||
use garage_model::garage::Garage;
|
||||
use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionMetaInner};
|
||||
|
||||
use crate::common_error::*;
|
||||
use crate::s3::checksum::Md5Checksum;
|
||||
use crate::s3::error::Error;
|
||||
|
||||
// Headers carrying the SSE-C parameters (algorithm, base64 key, base64 MD5 of
// the key) for the object targeted by the request. The algorithm and key-MD5
// headers are also written back into responses for encrypted objects.
const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName =
|
||||
HeaderName::from_static("x-amz-server-side-encryption-customer-algorithm");
|
||||
const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName =
|
||||
HeaderName::from_static("x-amz-server-side-encryption-customer-key");
|
||||
const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName =
|
||||
HeaderName::from_static("x-amz-server-side-encryption-customer-key-md5");
|
||||
|
||||
// Same three parameters, but describing the *source* object of a copy
// operation; they are parsed when decrypting a copy source.
const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName =
|
||||
HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-algorithm");
|
||||
const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName =
|
||||
HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key");
|
||||
const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName =
|
||||
HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key-md5");
|
||||
|
||||
// The only value accepted in the algorithm header; any other value is
// rejected when parsing request headers.
const CUSTOMER_ALGORITHM_AES256: &[u8] = b"AES256";
|
||||
|
||||
// Output type of an MD5 digest, used to carry the MD5 of the client key.
type Md5Output = md5::digest::Output<md5::Md5Core>;
|
||||
|
||||
// Size type of the nonce used by the LE31 streaming construction over
// AES-256-GCM; this is the nonce written at the start of every encrypted block.
type StreamNonceSize = aes_gcm::aead::stream::NonceSize<Aes256Gcm, StreamLE31<Aes256Gcm>>;
|
||||
|
||||
// Data blocks are encrypted by smaller chunks of size 4096 bytes,
|
||||
// so that data can be streamed when reading.
|
||||
// This size has to be known and has to be constant, or data won't be
|
||||
// readable anymore. DO NOT CHANGE THIS VALUE.
|
||||
const STREAM_ENC_PLAIN_CHUNK_SIZE: usize = 0x1000; // 4096 bytes
|
||||
// Size of a full chunk once encrypted: the plaintext chunk plus 16 bytes of
// per-chunk overhead (presumably the AES-GCM authentication tag -- confirm).
const STREAM_ENC_CYPER_CHUNK_SIZE: usize = STREAM_ENC_PLAIN_CHUNK_SIZE + 16;
|
||||
|
||||
/// Encryption settings applying to one object, as derived from the request
/// headers (and, when reading, from the stored object metadata).
#[derive(Clone, Copy)]
|
||||
pub enum EncryptionParams {
|
||||
/// The object is stored as-is, without encryption.
Plaintext,
|
||||
/// The object is encrypted with a key supplied by the client (SSE-C).
SseC {
|
||||
/// AES-256-GCM key decoded from the customer-key request header.
client_key: Key<Aes256Gcm>,
|
||||
/// MD5 digest of `client_key`; sent back base64-encoded in responses.
client_key_md5: Md5Output,
|
||||
/// `Some(level)` if data blocks are zstd-compressed (at that level)
/// before being encrypted, `None` if they are encrypted uncompressed.
compression_level: Option<i32>,
|
||||
},
|
||||
}
|
||||
|
||||
impl EncryptionParams {
|
||||
pub fn is_encrypted(&self) -> bool {
|
||||
!matches!(self, Self::Plaintext)
|
||||
}
|
||||
|
||||
pub fn is_same(a: &Self, b: &Self) -> bool {
|
||||
let relevant_info = |x: &Self| match x {
|
||||
Self::Plaintext => None,
|
||||
Self::SseC {
|
||||
client_key,
|
||||
compression_level,
|
||||
..
|
||||
} => Some((*client_key, compression_level.is_some())),
|
||||
};
|
||||
relevant_info(a) == relevant_info(b)
|
||||
}
|
||||
|
||||
pub fn new_from_headers(
|
||||
garage: &Garage,
|
||||
headers: &HeaderMap,
|
||||
) -> Result<EncryptionParams, Error> {
|
||||
let key = parse_request_headers(
|
||||
headers,
|
||||
&X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
|
||||
&X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
|
||||
&X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
|
||||
)?;
|
||||
match key {
|
||||
Some((client_key, client_key_md5)) => Ok(EncryptionParams::SseC {
|
||||
client_key,
|
||||
client_key_md5,
|
||||
compression_level: garage.config.compression_level,
|
||||
}),
|
||||
None => Ok(EncryptionParams::Plaintext),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_response_headers(&self, resp: &mut http::response::Builder) {
|
||||
if let Self::SseC { client_key_md5, .. } = self {
|
||||
let md5 = BASE64_STANDARD.encode(&client_key_md5);
|
||||
|
||||
resp.headers_mut().unwrap().insert(
|
||||
X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
|
||||
HeaderValue::from_bytes(CUSTOMER_ALGORITHM_AES256).unwrap(),
|
||||
);
|
||||
resp.headers_mut().unwrap().insert(
|
||||
X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
|
||||
HeaderValue::from_bytes(md5.as_bytes()).unwrap(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_decrypt<'a>(
|
||||
garage: &Garage,
|
||||
headers: &HeaderMap,
|
||||
obj_enc: &'a ObjectVersionEncryption,
|
||||
) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> {
|
||||
let key = parse_request_headers(
|
||||
headers,
|
||||
&X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
|
||||
&X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
|
||||
&X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
|
||||
)?;
|
||||
Self::check_decrypt_common(garage, key, obj_enc)
|
||||
}
|
||||
|
||||
pub fn check_decrypt_for_copy_source<'a>(
|
||||
garage: &Garage,
|
||||
headers: &HeaderMap,
|
||||
obj_enc: &'a ObjectVersionEncryption,
|
||||
) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> {
|
||||
let key = parse_request_headers(
|
||||
headers,
|
||||
&X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
|
||||
&X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
|
||||
&X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
|
||||
)?;
|
||||
Self::check_decrypt_common(garage, key, obj_enc)
|
||||
}
|
||||
|
||||
fn check_decrypt_common<'a>(
|
||||
garage: &Garage,
|
||||
key: Option<(Key<Aes256Gcm>, Md5Output)>,
|
||||
obj_enc: &'a ObjectVersionEncryption,
|
||||
) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> {
|
||||
match (key, &obj_enc) {
|
||||
(
|
||||
Some((client_key, client_key_md5)),
|
||||
ObjectVersionEncryption::SseC { inner, compressed },
|
||||
) => {
|
||||
let enc = Self::SseC {
|
||||
client_key,
|
||||
client_key_md5,
|
||||
compression_level: if *compressed {
|
||||
Some(garage.config.compression_level.unwrap_or(1))
|
||||
} else {
|
||||
None
|
||||
},
|
||||
};
|
||||
let plaintext = enc.decrypt_blob(&inner)?;
|
||||
let inner = ObjectVersionMetaInner::decode(&plaintext)
|
||||
.ok_or_internal_error("Could not decode encrypted metadata")?;
|
||||
Ok((enc, Cow::Owned(inner)))
|
||||
}
|
||||
(None, ObjectVersionEncryption::Plaintext { inner }) => {
|
||||
Ok((Self::Plaintext, Cow::Borrowed(inner)))
|
||||
}
|
||||
(_, ObjectVersionEncryption::SseC { .. }) => {
|
||||
Err(Error::bad_request("Object is encrypted"))
|
||||
}
|
||||
(Some(_), _) => {
|
||||
// TODO: should this be an OK scenario?
|
||||
Err(Error::bad_request("Trying to decrypt a plaintext object"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encrypt_meta(
|
||||
&self,
|
||||
meta: ObjectVersionMetaInner,
|
||||
) -> Result<ObjectVersionEncryption, Error> {
|
||||
match self {
|
||||
Self::SseC {
|
||||
compression_level, ..
|
||||
} => {
|
||||
let plaintext = meta.encode().map_err(GarageError::from)?;
|
||||
let ciphertext = self.encrypt_blob(&plaintext)?;
|
||||
Ok(ObjectVersionEncryption::SseC {
|
||||
inner: ciphertext.into_owned(),
|
||||
compressed: compression_level.is_some(),
|
||||
})
|
||||
}
|
||||
Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { inner: meta }),
|
||||
}
|
||||
}
|
||||
|
||||
// ---- generating object Etag values ----
|
||||
pub fn etag_from_md5(&self, md5sum: &Option<Md5Checksum>) -> String {
|
||||
match self {
|
||||
Self::Plaintext => md5sum
|
||||
.map(|x| hex::encode(&x[..]))
|
||||
.expect("md5 digest should have been computed"),
|
||||
Self::SseC { .. } => {
|
||||
// AWS specifies that for encrypted objects, the Etag is not
|
||||
// the md5sum of the data, but doesn't say what it is.
|
||||
// So we just put some random bytes.
|
||||
let mut random = [0u8; 16];
|
||||
OsRng.fill_bytes(&mut random);
|
||||
hex::encode(&random)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- generic function for encrypting / decrypting blobs ----
|
||||
// Prepends a randomly-generated nonce to the encrypted value.
|
||||
// This is used for encrypting object metadata and inlined data for small objects.
|
||||
// This does not compress anything.
|
||||
|
||||
pub fn encrypt_blob<'a>(&self, blob: &'a [u8]) -> Result<Cow<'a, [u8]>, Error> {
|
||||
match self {
|
||||
Self::SseC { client_key, .. } => {
|
||||
let cipher = Aes256Gcm::new(&client_key);
|
||||
let nonce = Aes256Gcm::generate_nonce(&mut OsRng);
|
||||
let ciphertext = cipher
|
||||
.encrypt(&nonce, blob)
|
||||
.ok_or_internal_error("Encryption failed")?;
|
||||
Ok(Cow::Owned([nonce.to_vec(), ciphertext].concat()))
|
||||
}
|
||||
Self::Plaintext => Ok(Cow::Borrowed(blob)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decrypt_blob<'a>(&self, blob: &'a [u8]) -> Result<Cow<'a, [u8]>, Error> {
|
||||
match self {
|
||||
Self::SseC { client_key, .. } => {
|
||||
let cipher = Aes256Gcm::new(&client_key);
|
||||
let nonce_size = <Aes256Gcm as AeadCore>::NonceSize::to_usize();
|
||||
let nonce = Nonce::from_slice(
|
||||
blob.get(..nonce_size)
|
||||
.ok_or_internal_error("invalid encrypted data")?,
|
||||
);
|
||||
let plaintext = cipher
|
||||
.decrypt(nonce, &blob[nonce_size..])
|
||||
.ok_or_bad_request(
|
||||
"Invalid encryption key, could not decrypt object metadata.",
|
||||
)?;
|
||||
Ok(Cow::Owned(plaintext))
|
||||
}
|
||||
Self::Plaintext => Ok(Cow::Borrowed(blob)),
|
||||
}
|
||||
}
|
||||
|
||||
// ---- function for encrypting / decrypting byte streams ----
|
||||
|
||||
/// Get a data block from the storage node, and decrypt+decompress it
|
||||
/// if necessary. If object is plaintext, just get it without any processing.
|
||||
pub async fn get_block(
|
||||
&self,
|
||||
garage: &Garage,
|
||||
hash: &Hash,
|
||||
order: Option<OrderTag>,
|
||||
) -> Result<ByteStream, GarageError> {
|
||||
let raw_block = garage
|
||||
.block_manager
|
||||
.rpc_get_block_streaming(hash, order)
|
||||
.await?;
|
||||
Ok(self.decrypt_block_stream(raw_block))
|
||||
}
|
||||
|
||||
pub fn decrypt_block_stream(&self, stream: ByteStream) -> ByteStream {
|
||||
match self {
|
||||
Self::Plaintext => stream,
|
||||
Self::SseC {
|
||||
client_key,
|
||||
compression_level,
|
||||
..
|
||||
} => {
|
||||
let plaintext = DecryptStream::new(stream, *client_key);
|
||||
if compression_level.is_some() {
|
||||
let reader = stream_asyncread(Box::pin(plaintext));
|
||||
let reader = BufReader::new(reader);
|
||||
let reader = async_compression::tokio::bufread::ZstdDecoder::new(reader);
|
||||
Box::pin(tokio_util::io::ReaderStream::new(reader))
|
||||
} else {
|
||||
Box::pin(plaintext)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Encrypt a data block if encryption is set, for use before
|
||||
/// putting the data blocks into storage
|
||||
pub fn encrypt_block(&self, block: Bytes) -> Result<Bytes, Error> {
|
||||
match self {
|
||||
Self::Plaintext => Ok(block),
|
||||
Self::SseC {
|
||||
client_key,
|
||||
compression_level,
|
||||
..
|
||||
} => {
|
||||
let block = if let Some(level) = compression_level {
|
||||
Cow::Owned(
|
||||
garage_block::zstd_encode(block.as_ref(), *level)
|
||||
.ok_or_internal_error("failed to compress data block")?,
|
||||
)
|
||||
} else {
|
||||
Cow::Borrowed(block.as_ref())
|
||||
};
|
||||
|
||||
let mut ret = Vec::with_capacity(block.len() + 32 + block.len() / 64);
|
||||
|
||||
let mut nonce: Nonce<StreamNonceSize> = Default::default();
|
||||
OsRng.fill_bytes(&mut nonce);
|
||||
ret.extend_from_slice(nonce.as_slice());
|
||||
|
||||
let mut cipher = EncryptorLE31::<Aes256Gcm>::new(&client_key, &nonce);
|
||||
let mut iter = block.chunks(STREAM_ENC_PLAIN_CHUNK_SIZE).peekable();
|
||||
|
||||
if iter.peek().is_none() {
|
||||
// Empty stream: we encrypt an empty last chunk
|
||||
let chunk_enc = cipher
|
||||
.encrypt_last(&[][..])
|
||||
.ok_or_internal_error("failed to encrypt chunk")?;
|
||||
ret.extend_from_slice(&chunk_enc);
|
||||
} else {
|
||||
loop {
|
||||
let chunk = iter.next().unwrap();
|
||||
if iter.peek().is_some() {
|
||||
let chunk_enc = cipher
|
||||
.encrypt_next(chunk)
|
||||
.ok_or_internal_error("failed to encrypt chunk")?;
|
||||
assert_eq!(chunk.len(), STREAM_ENC_PLAIN_CHUNK_SIZE);
|
||||
assert_eq!(chunk_enc.len(), STREAM_ENC_CYPER_CHUNK_SIZE);
|
||||
ret.extend_from_slice(&chunk_enc);
|
||||
} else {
|
||||
// use encrypt_last for the last chunk
|
||||
let chunk_enc = cipher
|
||||
.encrypt_last(chunk)
|
||||
.ok_or_internal_error("failed to encrypt chunk")?;
|
||||
ret.extend_from_slice(&chunk_enc);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ret.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_request_headers(
|
||||
headers: &HeaderMap,
|
||||
alg_header: &HeaderName,
|
||||
key_header: &HeaderName,
|
||||
md5_header: &HeaderName,
|
||||
) -> Result<Option<(Key<Aes256Gcm>, Md5Output)>, Error> {
|
||||
let alg = headers.get(alg_header).map(HeaderValue::as_bytes);
|
||||
let key = headers.get(key_header).map(HeaderValue::as_bytes);
|
||||
let md5 = headers.get(md5_header).map(HeaderValue::as_bytes);
|
||||
|
||||
match alg {
|
||||
Some(CUSTOMER_ALGORITHM_AES256) => {
|
||||
use md5::{Digest, Md5};
|
||||
|
||||
let key_b64 =
|
||||
key.ok_or_bad_request("Missing server-side-encryption-customer-key header")?;
|
||||
let key_bytes: [u8; 32] = BASE64_STANDARD
|
||||
.decode(&key_b64)
|
||||
.ok_or_bad_request(
|
||||
"Invalid server-side-encryption-customer-key header: invalid base64",
|
||||
)?
|
||||
.try_into()
|
||||
.ok()
|
||||
.ok_or_bad_request(
|
||||
"Invalid server-side-encryption-customer-key header: invalid length",
|
||||
)?;
|
||||
|
||||
let md5_b64 =
|
||||
md5.ok_or_bad_request("Missing server-side-encryption-customer-key-md5 header")?;
|
||||
let md5_bytes = BASE64_STANDARD.decode(&md5_b64).ok_or_bad_request(
|
||||
"Invalid server-side-encryption-customer-key-md5 header: invalid bass64",
|
||||
)?;
|
||||
|
||||
let mut hasher = Md5::new();
|
||||
hasher.update(&key_bytes[..]);
|
||||
let our_md5 = hasher.finalize();
|
||||
if our_md5.as_slice() != md5_bytes.as_slice() {
|
||||
return Err(Error::bad_request(
|
||||
"Server-side encryption client key MD5 checksum does not match",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(Some((key_bytes.into(), our_md5)))
|
||||
}
|
||||
Some(alg) => Err(Error::InvalidEncryptionAlgorithm(
|
||||
String::from_utf8_lossy(alg).into_owned(),
|
||||
)),
|
||||
None => {
|
||||
if key.is_some() || md5.is_some() {
|
||||
Err(Error::bad_request(
|
||||
"Unexpected server-side-encryption-customer-key{,-md5} header(s)",
|
||||
))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- encrypt & decrypt streams ----
|
||||
|
||||
/// Stream adapter that decrypts the stored form of an encrypted data block
/// (a nonce followed by encrypted chunks) as it is read from `stream`.
#[pin_project::pin_project]
|
||||
struct DecryptStream {
|
||||
/// Underlying stream of encrypted bytes.
#[pin]
|
||||
stream: ByteStream,
|
||||
/// Set once `stream` has returned `None` while reading chunk data.
done_reading: bool,
|
||||
/// Bytes received from `stream` that have not been decrypted yet.
buf: BytesBuf,
|
||||
/// Key used to initialize the decryptor once the nonce has been read.
key: Key<Aes256Gcm>,
|
||||
/// Current phase of the decryption, see `DecryptStreamState`.
state: DecryptStreamState,
|
||||
}
|
||||
|
||||
/// Phases a `DecryptStream` goes through.
enum DecryptStreamState {
|
||||
/// The nonce at the start of the stream has not been read yet.
Starting,
|
||||
/// The nonce has been read; chunks are being decrypted with this decryptor.
Running(DecryptorLE31<Aes256Gcm>),
|
||||
/// The last chunk has been handed to the decryptor, which is now consumed.
Done,
|
||||
}
|
||||
|
||||
impl DecryptStream {
|
||||
fn new(stream: ByteStream, key: Key<Aes256Gcm>) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
done_reading: false,
|
||||
buf: BytesBuf::new(),
|
||||
key,
|
||||
state: DecryptStreamState::Starting,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for DecryptStream {
|
||||
type Item = Result<Bytes, std::io::Error>;
|
||||
|
||||
fn poll_next(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut task::Context<'_>,
|
||||
) -> task::Poll<Option<Self::Item>> {
|
||||
use std::task::Poll;
|
||||
|
||||
let mut this = self.project();
|
||||
|
||||
// The first bytes of the stream should contain the starting nonce.
|
||||
// If we don't have a Running state, it means that we haven't
|
||||
// yet read the nonce.
|
||||
while matches!(this.state, DecryptStreamState::Starting) {
|
||||
let nonce_size = StreamNonceSize::to_usize();
|
||||
if let Some(nonce) = this.buf.take_exact(nonce_size) {
|
||||
let nonce = Nonce::from_slice(nonce.as_ref());
|
||||
*this.state = DecryptStreamState::Running(DecryptorLE31::new(&this.key, nonce));
|
||||
break;
|
||||
}
|
||||
|
||||
match futures::ready!(this.stream.as_mut().poll_next(cx)) {
|
||||
Some(Ok(bytes)) => {
|
||||
this.buf.extend(bytes);
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
return Poll::Ready(Some(Err(e)));
|
||||
}
|
||||
None => {
|
||||
return Poll::Ready(Some(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
"Decrypt: unexpected EOF, could not read nonce",
|
||||
))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Read at least one byte more than the encrypted chunk size
|
||||
// (if possible), so that we know if we are decrypting the
|
||||
// last chunk or not.
|
||||
while !*this.done_reading && this.buf.len() <= STREAM_ENC_CYPER_CHUNK_SIZE {
|
||||
match futures::ready!(this.stream.as_mut().poll_next(cx)) {
|
||||
Some(Ok(bytes)) => {
|
||||
this.buf.extend(bytes);
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
return Poll::Ready(Some(Err(e)));
|
||||
}
|
||||
None => {
|
||||
*this.done_reading = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if matches!(this.state, DecryptStreamState::Done) {
|
||||
if !this.buf.is_empty() {
|
||||
return Poll::Ready(Some(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"Decrypt: unexpected bytes after last encrypted chunk",
|
||||
))));
|
||||
}
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
|
||||
let res = if this.buf.len() > STREAM_ENC_CYPER_CHUNK_SIZE {
|
||||
// we have strictly more bytes than the encrypted chunk size,
|
||||
// so we know this is not the last
|
||||
let DecryptStreamState::Running(ref mut cipher) = this.state else {
|
||||
unreachable!()
|
||||
};
|
||||
let chunk = this.buf.take_exact(STREAM_ENC_CYPER_CHUNK_SIZE).unwrap();
|
||||
let chunk_dec = cipher.decrypt_next(chunk.as_ref());
|
||||
if let Ok(c) = &chunk_dec {
|
||||
assert_eq!(c.len(), STREAM_ENC_PLAIN_CHUNK_SIZE);
|
||||
}
|
||||
chunk_dec
|
||||
} else {
|
||||
// We have one encrypted chunk size or less, even though we tried
|
||||
// to read more, so this is the last chunk. Decrypt using the
|
||||
// appropriate decrypt_last() function that then destroys the cipher.
|
||||
let state = std::mem::replace(this.state, DecryptStreamState::Done);
|
||||
let DecryptStreamState::Running(cipher) = state else {
|
||||
unreachable!()
|
||||
};
|
||||
let chunk = this.buf.take_all();
|
||||
cipher.decrypt_last(chunk.as_ref())
|
||||
};
|
||||
|
||||
match res {
|
||||
Ok(bytes) if bytes.is_empty() => Poll::Ready(None),
|
||||
Ok(bytes) => Poll::Ready(Some(Ok(bytes.into()))),
|
||||
Err(_) => Poll::Ready(Some(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"Decryption failed",
|
||||
)))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use futures::stream::StreamExt;
    use garage_net::stream::read_stream_to_end;

    /// Deterministic test input: a stream of variable-sized pieces
    /// (0 to 1023 bytes each), each filled with a single byte value.
    fn stream() -> ByteStream {
        let pieces = futures::stream::iter(16usize..1024).map(|i| {
            let fill = (i % 256) as u8;
            let len = (i * 37) % 1024;
            Ok(Bytes::from(vec![fill; len]))
        });
        Box::pin(pieces)
    }

    /// Encrypts a block and checks that decrypting it yields the
    /// original bytes, with or without compression.
    async fn test_block_enc(compression_level: Option<i32>) {
        let params = EncryptionParams::SseC {
            client_key: Aes256Gcm::generate_key(&mut OsRng),
            client_key_md5: Default::default(), // not needed
            compression_level,
        };

        let original = read_stream_to_end(stream()).await.unwrap().into_bytes();

        let stored = params.encrypt_block(original.clone()).unwrap();

        let stored_stream: ByteStream = Box::pin(futures::stream::once(async { Ok(stored) }));
        let recovered = params.decrypt_block_stream(stored_stream);
        let recovered = read_stream_to_end(recovered).await.unwrap().into_bytes();

        assert_eq!(original, recovered);
        assert!(recovered.len() > 128000);
    }

    #[tokio::test]
    async fn test_encrypt_block() {
        test_block_enc(None).await
    }

    #[tokio::test]
    async fn test_encrypt_block_compressed() {
        test_block_enc(Some(1)).await
    }
}
|
|
@ -65,6 +65,14 @@ pub enum Error {
|
|||
#[error(display = "Invalid HTTP range: {:?}", _0)]
|
||||
InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)),
|
||||
|
||||
/// The client requested a server-side encryption algorithm other than AES256
|
||||
#[error(display = "Invalid encryption algorithm: {:?}, should be AES256", _0)]
|
||||
InvalidEncryptionAlgorithm(String),
|
||||
|
||||
/// The client sent an invalid digest value
|
||||
#[error(display = "Invalid digest: {}", _0)]
|
||||
InvalidDigest(String),
|
||||
|
||||
/// The client sent a request for an action not supported by garage
|
||||
#[error(display = "Unimplemented action: {}", _0)]
|
||||
NotImplemented(String),
|
||||
|
@ -125,7 +133,9 @@ impl Error {
|
|||
Error::NotImplemented(_) => "NotImplemented",
|
||||
Error::InvalidXml(_) => "MalformedXML",
|
||||
Error::InvalidRange(_) => "InvalidRange",
|
||||
Error::InvalidDigest(_) => "InvalidDigest",
|
||||
Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest",
|
||||
Error::InvalidEncryptionAlgorithm(_) => "InvalidEncryptionAlgorithmError",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -143,6 +153,8 @@ impl ApiError for Error {
|
|||
| Error::InvalidPart
|
||||
| Error::InvalidPartOrder
|
||||
| Error::EntityTooSmall
|
||||
| Error::InvalidDigest(_)
|
||||
| Error::InvalidEncryptionAlgorithm(_)
|
||||
| Error::InvalidXml(_)
|
||||
| Error::InvalidUtf8Str(_)
|
||||
| Error::InvalidUtf8String(_) => StatusCode::BAD_REQUEST,
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
//! Functions related to GET and HEAD requests
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryInto;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, UNIX_EPOCH};
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::future;
|
||||
use futures::stream::{self, StreamExt};
|
||||
use futures::stream::{self, Stream, StreamExt};
|
||||
use http::header::{
|
||||
ACCEPT_RANGES, CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE,
|
||||
CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, EXPIRES, IF_MODIFIED_SINCE, IF_NONE_MATCH,
|
||||
|
@ -25,6 +27,8 @@ use garage_model::s3::version_table::*;
|
|||
|
||||
use crate::helpers::*;
|
||||
use crate::s3::api_server::ResBody;
|
||||
use crate::s3::checksum::{add_checksum_response_headers, X_AMZ_CHECKSUM_MODE};
|
||||
use crate::s3::encryption::EncryptionParams;
|
||||
use crate::s3::error::*;
|
||||
|
||||
const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count";
|
||||
|
@ -42,6 +46,9 @@ pub struct GetObjectOverrides {
|
|||
fn object_headers(
|
||||
version: &ObjectVersion,
|
||||
version_meta: &ObjectVersionMeta,
|
||||
meta_inner: &ObjectVersionMetaInner,
|
||||
encryption: EncryptionParams,
|
||||
checksum_mode: ChecksumMode,
|
||||
) -> http::response::Builder {
|
||||
debug!("Version meta: {:?}", version_meta);
|
||||
|
||||
|
@ -49,7 +56,6 @@ fn object_headers(
|
|||
let date_str = httpdate::fmt_http_date(date);
|
||||
|
||||
let mut resp = Response::builder()
|
||||
.header(CONTENT_TYPE, version_meta.headers.content_type.to_string())
|
||||
.header(LAST_MODIFIED, date_str)
|
||||
.header(ACCEPT_RANGES, "bytes".to_string());
|
||||
|
||||
|
@ -57,9 +63,30 @@ fn object_headers(
|
|||
resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag));
|
||||
}
|
||||
|
||||
for (k, v) in version_meta.headers.other.iter() {
|
||||
resp = resp.header(k, v.to_string());
|
||||
// When metadata is retrieved through the REST API, Amazon S3 combines headers that
|
||||
// have the same name (ignoring case) into a comma-delimited list.
|
||||
// See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html
|
||||
let mut headers_by_name = BTreeMap::new();
|
||||
for (name, value) in meta_inner.headers.iter() {
|
||||
match headers_by_name.get_mut(name) {
|
||||
None => {
|
||||
headers_by_name.insert(name, vec![value.as_str()]);
|
||||
}
|
||||
Some(headers) => {
|
||||
headers.push(value.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (name, values) in headers_by_name {
|
||||
resp = resp.header(name, values.join(","));
|
||||
}
|
||||
|
||||
if checksum_mode.enabled {
|
||||
resp = add_checksum_response_headers(&meta_inner.checksum, resp);
|
||||
}
|
||||
|
||||
encryption.add_response_headers(&mut resp);
|
||||
|
||||
resp
|
||||
}
|
||||
|
@ -175,17 +202,29 @@ pub async fn handle_head_without_ctx(
|
|||
return Ok(cached);
|
||||
}
|
||||
|
||||
let (encryption, headers) =
|
||||
EncryptionParams::check_decrypt(&garage, req.headers(), &version_meta.encryption)?;
|
||||
|
||||
let checksum_mode = checksum_mode(&req);
|
||||
|
||||
if let Some(pn) = part_number {
|
||||
match version_data {
|
||||
ObjectVersionData::Inline(_, bytes) => {
|
||||
ObjectVersionData::Inline(_, _) => {
|
||||
if pn != 1 {
|
||||
return Err(Error::InvalidPart);
|
||||
}
|
||||
Ok(object_headers(object_version, version_meta)
|
||||
.header(CONTENT_LENGTH, format!("{}", bytes.len()))
|
||||
let bytes_len = version_meta.size;
|
||||
Ok(object_headers(
|
||||
object_version,
|
||||
version_meta,
|
||||
&headers,
|
||||
encryption,
|
||||
checksum_mode,
|
||||
)
|
||||
.header(CONTENT_LENGTH, format!("{}", bytes_len))
|
||||
.header(
|
||||
CONTENT_RANGE,
|
||||
format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()),
|
||||
format!("bytes 0-{}/{}", bytes_len - 1, bytes_len),
|
||||
)
|
||||
.header(X_AMZ_MP_PARTS_COUNT, "1")
|
||||
.status(StatusCode::PARTIAL_CONTENT)
|
||||
|
@ -201,7 +240,13 @@ pub async fn handle_head_without_ctx(
|
|||
let (part_offset, part_end) =
|
||||
calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?;
|
||||
|
||||
Ok(object_headers(object_version, version_meta)
|
||||
Ok(object_headers(
|
||||
object_version,
|
||||
version_meta,
|
||||
&headers,
|
||||
encryption,
|
||||
checksum_mode,
|
||||
)
|
||||
.header(CONTENT_LENGTH, format!("{}", part_end - part_offset))
|
||||
.header(
|
||||
CONTENT_RANGE,
|
||||
|
@ -219,7 +264,13 @@ pub async fn handle_head_without_ctx(
|
|||
_ => unreachable!(),
|
||||
}
|
||||
} else {
|
||||
Ok(object_headers(object_version, version_meta)
|
||||
Ok(object_headers(
|
||||
object_version,
|
||||
version_meta,
|
||||
&headers,
|
||||
encryption,
|
||||
checksum_mode,
|
||||
)
|
||||
.header(CONTENT_LENGTH, format!("{}", version_meta.size))
|
||||
.status(StatusCode::OK)
|
||||
.body(empty_body())?)
|
||||
|
@ -273,23 +324,55 @@ pub async fn handle_get_without_ctx(
|
|||
return Ok(cached);
|
||||
}
|
||||
|
||||
let (enc, headers) =
|
||||
EncryptionParams::check_decrypt(&garage, req.headers(), &last_v_meta.encryption)?;
|
||||
|
||||
let checksum_mode = checksum_mode(&req);
|
||||
|
||||
match (part_number, parse_range_header(req, last_v_meta.size)?) {
|
||||
(Some(_), Some(_)) => Err(Error::bad_request(
|
||||
"Cannot specify both partNumber and Range header",
|
||||
)),
|
||||
(Some(pn), None) => handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await,
|
||||
(Some(pn), None) => {
|
||||
handle_get_part(
|
||||
garage,
|
||||
last_v,
|
||||
last_v_data,
|
||||
last_v_meta,
|
||||
enc,
|
||||
&headers,
|
||||
pn,
|
||||
checksum_mode,
|
||||
)
|
||||
.await
|
||||
}
|
||||
(None, Some(range)) => {
|
||||
handle_get_range(
|
||||
garage,
|
||||
last_v,
|
||||
last_v_data,
|
||||
last_v_meta,
|
||||
enc,
|
||||
&headers,
|
||||
range.start,
|
||||
range.start + range.length,
|
||||
checksum_mode,
|
||||
)
|
||||
.await
|
||||
}
|
||||
(None, None) => {
|
||||
handle_get_full(
|
||||
garage,
|
||||
last_v,
|
||||
last_v_data,
|
||||
last_v_meta,
|
||||
enc,
|
||||
&headers,
|
||||
overrides,
|
||||
checksum_mode,
|
||||
)
|
||||
.await
|
||||
}
|
||||
(None, None) => handle_get_full(garage, last_v, last_v_data, last_v_meta, overrides).await,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -298,17 +381,43 @@ async fn handle_get_full(
|
|||
version: &ObjectVersion,
|
||||
version_data: &ObjectVersionData,
|
||||
version_meta: &ObjectVersionMeta,
|
||||
encryption: EncryptionParams,
|
||||
meta_inner: &ObjectVersionMetaInner,
|
||||
overrides: GetObjectOverrides,
|
||||
checksum_mode: ChecksumMode,
|
||||
) -> Result<Response<ResBody>, Error> {
|
||||
let mut resp_builder = object_headers(version, version_meta)
|
||||
let mut resp_builder = object_headers(
|
||||
version,
|
||||
version_meta,
|
||||
&meta_inner,
|
||||
encryption,
|
||||
checksum_mode,
|
||||
)
|
||||
.header(CONTENT_LENGTH, format!("{}", version_meta.size))
|
||||
.status(StatusCode::OK);
|
||||
getobject_override_headers(overrides, &mut resp_builder)?;
|
||||
|
||||
let stream = full_object_byte_stream(garage, version, version_data, encryption);
|
||||
|
||||
Ok(resp_builder.body(response_body_from_stream(stream))?)
|
||||
}
|
||||
|
||||
pub fn full_object_byte_stream(
|
||||
garage: Arc<Garage>,
|
||||
version: &ObjectVersion,
|
||||
version_data: &ObjectVersionData,
|
||||
encryption: EncryptionParams,
|
||||
) -> ByteStream {
|
||||
match &version_data {
|
||||
ObjectVersionData::DeleteMarker => unreachable!(),
|
||||
ObjectVersionData::Inline(_, bytes) => {
|
||||
Ok(resp_builder.body(bytes_body(bytes.to_vec().into()))?)
|
||||
let bytes = bytes.to_vec();
|
||||
Box::pin(futures::stream::once(async move {
|
||||
encryption
|
||||
.decrypt_blob(&bytes)
|
||||
.map(|x| Bytes::from(x.to_vec()))
|
||||
.map_err(std_error_from_read_error)
|
||||
}))
|
||||
}
|
||||
ObjectVersionData::FirstBlock(_, first_block_hash) => {
|
||||
let (tx, rx) = mpsc::channel::<ByteStream>(2);
|
||||
|
@ -324,19 +433,18 @@ async fn handle_get_full(
|
|||
garage2.version_table.get(&version_uuid, &EmptyKey).await
|
||||
});
|
||||
|
||||
let stream_block_0 = garage
|
||||
.block_manager
|
||||
.rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0)))
|
||||
let stream_block_0 = encryption
|
||||
.get_block(&garage, &first_block_hash, Some(order_stream.order(0)))
|
||||
.await?;
|
||||
|
||||
tx.send(stream_block_0)
|
||||
.await
|
||||
.ok_or_message("channel closed")?;
|
||||
|
||||
let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?;
|
||||
for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) {
|
||||
let stream_block_i = garage
|
||||
.block_manager
|
||||
.rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64)))
|
||||
let stream_block_i = encryption
|
||||
.get_block(&garage, &vb.hash, Some(order_stream.order(i as u64)))
|
||||
.await?;
|
||||
tx.send(stream_block_i)
|
||||
.await
|
||||
|
@ -354,8 +462,7 @@ async fn handle_get_full(
|
|||
}
|
||||
});
|
||||
|
||||
let body = response_body_from_block_stream(rx);
|
||||
Ok(resp_builder.body(body)?)
|
||||
Box::pin(tokio_stream::wrappers::ReceiverStream::new(rx).flatten())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -365,13 +472,16 @@ async fn handle_get_range(
|
|||
version: &ObjectVersion,
|
||||
version_data: &ObjectVersionData,
|
||||
version_meta: &ObjectVersionMeta,
|
||||
encryption: EncryptionParams,
|
||||
meta_inner: &ObjectVersionMetaInner,
|
||||
begin: u64,
|
||||
end: u64,
|
||||
checksum_mode: ChecksumMode,
|
||||
) -> Result<Response<ResBody>, Error> {
|
||||
// Here we do not use getobject_override_headers because we don't
|
||||
// want to add any overridden headers (those should not be added
|
||||
// when returning PARTIAL_CONTENT)
|
||||
let resp_builder = object_headers(version, version_meta)
|
||||
let resp_builder = object_headers(version, version_meta, meta_inner, encryption, checksum_mode)
|
||||
.header(CONTENT_LENGTH, format!("{}", end - begin))
|
||||
.header(
|
||||
CONTENT_RANGE,
|
||||
|
@ -382,6 +492,7 @@ async fn handle_get_range(
|
|||
match &version_data {
|
||||
ObjectVersionData::DeleteMarker => unreachable!(),
|
||||
ObjectVersionData::Inline(_meta, bytes) => {
|
||||
let bytes = encryption.decrypt_blob(&bytes)?;
|
||||
if end as usize <= bytes.len() {
|
||||
let body = bytes_body(bytes[begin as usize..end as usize].to_vec().into());
|
||||
Ok(resp_builder.body(body)?)
|
||||
|
@ -398,7 +509,8 @@ async fn handle_get_range(
|
|||
.await?
|
||||
.ok_or(Error::NoSuchKey)?;
|
||||
|
||||
let body = body_from_blocks_range(garage, version.blocks.items(), begin, end);
|
||||
let body =
|
||||
body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end);
|
||||
Ok(resp_builder.body(body)?)
|
||||
}
|
||||
}
|
||||
|
@ -409,17 +521,28 @@ async fn handle_get_part(
|
|||
object_version: &ObjectVersion,
|
||||
version_data: &ObjectVersionData,
|
||||
version_meta: &ObjectVersionMeta,
|
||||
encryption: EncryptionParams,
|
||||
meta_inner: &ObjectVersionMetaInner,
|
||||
part_number: u64,
|
||||
checksum_mode: ChecksumMode,
|
||||
) -> Result<Response<ResBody>, Error> {
|
||||
// Same as for get_range, no getobject_override_headers
|
||||
let resp_builder =
|
||||
object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT);
|
||||
let resp_builder = object_headers(
|
||||
object_version,
|
||||
version_meta,
|
||||
meta_inner,
|
||||
encryption,
|
||||
checksum_mode,
|
||||
)
|
||||
.status(StatusCode::PARTIAL_CONTENT);
|
||||
|
||||
match version_data {
|
||||
ObjectVersionData::Inline(_, bytes) => {
|
||||
if part_number != 1 {
|
||||
return Err(Error::InvalidPart);
|
||||
}
|
||||
let bytes = encryption.decrypt_blob(&bytes)?;
|
||||
assert_eq!(bytes.len() as u64, version_meta.size);
|
||||
Ok(resp_builder
|
||||
.header(CONTENT_LENGTH, format!("{}", bytes.len()))
|
||||
.header(
|
||||
|
@ -427,7 +550,7 @@ async fn handle_get_part(
|
|||
format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()),
|
||||
)
|
||||
.header(X_AMZ_MP_PARTS_COUNT, "1")
|
||||
.body(bytes_body(bytes.to_vec().into()))?)
|
||||
.body(bytes_body(bytes.into_owned().into()))?)
|
||||
}
|
||||
ObjectVersionData::FirstBlock(_, _) => {
|
||||
let version = garage
|
||||
|
@ -439,7 +562,8 @@ async fn handle_get_part(
|
|||
let (begin, end) =
|
||||
calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?;
|
||||
|
||||
let body = body_from_blocks_range(garage, version.blocks.items(), begin, end);
|
||||
let body =
|
||||
body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end);
|
||||
|
||||
Ok(resp_builder
|
||||
.header(CONTENT_LENGTH, format!("{}", end - begin))
|
||||
|
@ -492,8 +616,23 @@ fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> {
|
|||
None
|
||||
}
|
||||
|
||||
/// Per-request flag produced by `checksum_mode` from the request headers.
///
/// NOTE(review): the value is handed to `object_headers`; presumably it decides
/// whether stored checksums are echoed back in the response — confirm there.
#[derive(Debug, Clone, Copy)]
struct ChecksumMode {
    /// `true` when the checksum-mode request header was present and equal to
    /// `ENABLED`; `false` otherwise.
    enabled: bool,
}
|
||||
|
||||
fn checksum_mode(req: &Request<impl Body>) -> ChecksumMode {
|
||||
ChecksumMode {
|
||||
enabled: req
|
||||
.headers()
|
||||
.get(X_AMZ_CHECKSUM_MODE)
|
||||
.map(|x| x == "ENABLED")
|
||||
.unwrap_or(false),
|
||||
}
|
||||
}
|
||||
|
||||
fn body_from_blocks_range(
|
||||
garage: Arc<Garage>,
|
||||
encryption: EncryptionParams,
|
||||
all_blocks: &[(VersionBlockKey, VersionBlock)],
|
||||
begin: u64,
|
||||
end: u64,
|
||||
|
@ -523,12 +662,11 @@ fn body_from_blocks_range(
|
|||
|
||||
tokio::spawn(async move {
|
||||
match async {
|
||||
let garage = garage.clone();
|
||||
for (i, (block, block_offset)) in blocks.iter().enumerate() {
|
||||
let block_stream = garage
|
||||
.block_manager
|
||||
.rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64)))
|
||||
.await?
|
||||
let block_stream = encryption
|
||||
.get_block(&garage, &block.hash, Some(order_stream.order(i as u64)))
|
||||
.await?;
|
||||
let block_stream = block_stream
|
||||
.scan(*block_offset, move |chunk_offset, chunk| {
|
||||
let r = match chunk {
|
||||
Ok(chunk_bytes) => {
|
||||
|
@ -588,9 +726,15 @@ fn body_from_blocks_range(
|
|||
}
|
||||
|
||||
fn response_body_from_block_stream(rx: mpsc::Receiver<ByteStream>) -> ResBody {
|
||||
let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx)
|
||||
.flatten()
|
||||
.map(|x| {
|
||||
let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten();
|
||||
response_body_from_stream(body_stream)
|
||||
}
|
||||
|
||||
fn response_body_from_stream<S>(stream: S) -> ResBody
|
||||
where
|
||||
S: Stream<Item = Result<Bytes, std::io::Error>> + Send + Sync + 'static,
|
||||
{
|
||||
let body_stream = stream.map(|x| {
|
||||
x.map(hyper::body::Frame::data)
|
||||
.map_err(|e| Error::from(garage_util::error::Error::from(e)))
|
||||
});
|
||||
|
@ -598,9 +742,14 @@ fn response_body_from_block_stream(rx: mpsc::Receiver<ByteStream>) -> ResBody {
|
|||
}
|
||||
|
||||
fn error_stream_item<E: std::fmt::Display>(e: E) -> ByteStream {
|
||||
let err = std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Error while getting object data: {}", e),
|
||||
);
|
||||
Box::pin(stream::once(future::ready(Err(err))))
|
||||
Box::pin(stream::once(future::ready(Err(std_error_from_read_error(
|
||||
e,
|
||||
)))))
|
||||
}
|
||||
|
||||
/// Converts any displayable error into a `std::io::Error`.
///
/// The returned error has kind `ErrorKind::Other` and the message
/// `"Error while reading object data: <e>"`, where `<e>` is the `Display`
/// rendering of the input.
fn std_error_from_read_error<E: std::fmt::Display>(e: E) -> std::io::Error {
    let message = format!("Error while reading object data: {}", e);
    std::io::Error::new(std::io::ErrorKind::Other, message)
}
|
||||
|
|
|
@ -2,7 +2,7 @@ use std::collections::{BTreeMap, BTreeSet};
|
|||
use std::iter::{Iterator, Peekable};
|
||||
|
||||
use base64::prelude::*;
|
||||
use hyper::Response;
|
||||
use hyper::{Request, Response};
|
||||
|
||||
use garage_util::data::*;
|
||||
use garage_util::error::Error as GarageError;
|
||||
|
@ -15,7 +15,8 @@ use garage_table::EnumerationOrder;
|
|||
|
||||
use crate::encoding::*;
|
||||
use crate::helpers::*;
|
||||
use crate::s3::api_server::ResBody;
|
||||
use crate::s3::api_server::{ReqBody, ResBody};
|
||||
use crate::s3::encryption::EncryptionParams;
|
||||
use crate::s3::error::*;
|
||||
use crate::s3::multipart as s3_multipart;
|
||||
use crate::s3::xml as s3_xml;
|
||||
|
@ -271,13 +272,21 @@ pub async fn handle_list_multipart_upload(
|
|||
|
||||
pub async fn handle_list_parts(
|
||||
ctx: ReqCtx,
|
||||
req: Request<ReqBody>,
|
||||
query: &ListPartsQuery,
|
||||
) -> Result<Response<ResBody>, Error> {
|
||||
debug!("ListParts {:?}", query);
|
||||
|
||||
let upload_id = s3_multipart::decode_upload_id(&query.upload_id)?;
|
||||
|
||||
let (_, _, mpu) = s3_multipart::get_upload(&ctx, &query.key, &upload_id).await?;
|
||||
let (_, object_version, mpu) = s3_multipart::get_upload(&ctx, &query.key, &upload_id).await?;
|
||||
|
||||
let object_encryption = match object_version.state {
|
||||
ObjectVersionState::Uploading { encryption, .. } => encryption,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let encryption_res =
|
||||
EncryptionParams::check_decrypt(&ctx.garage, req.headers(), &object_encryption);
|
||||
|
||||
let (info, next) = fetch_part_info(query, &mpu)?;
|
||||
|
||||
|
@ -296,11 +305,40 @@ pub async fn handle_list_parts(
|
|||
is_truncated: s3_xml::Value(format!("{}", next.is_some())),
|
||||
parts: info
|
||||
.iter()
|
||||
.map(|part| s3_xml::PartItem {
|
||||
.map(|part| {
|
||||
// hide checksum if object is encrypted and the decryption
|
||||
// keys are not provided
|
||||
let checksum = part.checksum.filter(|_| encryption_res.is_ok());
|
||||
s3_xml::PartItem {
|
||||
etag: s3_xml::Value(format!("\"{}\"", part.etag)),
|
||||
last_modified: s3_xml::Value(msec_to_rfc3339(part.timestamp)),
|
||||
part_number: s3_xml::IntValue(part.part_number as i64),
|
||||
size: s3_xml::IntValue(part.size as i64),
|
||||
checksum_crc32: match &checksum {
|
||||
Some(ChecksumValue::Crc32(x)) => {
|
||||
Some(s3_xml::Value(BASE64_STANDARD.encode(&x)))
|
||||
}
|
||||
_ => None,
|
||||
},
|
||||
checksum_crc32c: match &checksum {
|
||||
Some(ChecksumValue::Crc32c(x)) => {
|
||||
Some(s3_xml::Value(BASE64_STANDARD.encode(&x)))
|
||||
}
|
||||
_ => None,
|
||||
},
|
||||
checksum_sha1: match &checksum {
|
||||
Some(ChecksumValue::Sha1(x)) => {
|
||||
Some(s3_xml::Value(BASE64_STANDARD.encode(&x)))
|
||||
}
|
||||
_ => None,
|
||||
},
|
||||
checksum_sha256: match &checksum {
|
||||
Some(ChecksumValue::Sha256(x)) => {
|
||||
Some(s3_xml::Value(BASE64_STANDARD.encode(&x)))
|
||||
}
|
||||
_ => None,
|
||||
},
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
|
||||
|
@ -346,6 +384,7 @@ struct PartInfo<'a> {
|
|||
timestamp: u64,
|
||||
part_number: u64,
|
||||
size: u64,
|
||||
checksum: Option<ChecksumValue>,
|
||||
}
|
||||
|
||||
enum ExtractionResult {
|
||||
|
@ -486,6 +525,7 @@ fn fetch_part_info<'a>(
|
|||
timestamp: pk.timestamp,
|
||||
etag,
|
||||
size,
|
||||
checksum: p.checksum,
|
||||
};
|
||||
match parts.last_mut() {
|
||||
Some(lastpart) if lastpart.part_number == pk.part_number => {
|
||||
|
@ -944,11 +984,14 @@ mod tests {
|
|||
timestamp: TS,
|
||||
state: ObjectVersionState::Uploading {
|
||||
multipart: true,
|
||||
headers: ObjectVersionHeaders {
|
||||
content_type: "text/plain".to_string(),
|
||||
other: BTreeMap::<String, String>::new(),
|
||||
encryption: ObjectVersionEncryption::Plaintext {
|
||||
inner: ObjectVersionMetaInner {
|
||||
headers: vec![],
|
||||
checksum: None,
|
||||
},
|
||||
},
|
||||
checksum_algorithm: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1136,6 +1179,7 @@ mod tests {
|
|||
version: uuid,
|
||||
size: Some(3),
|
||||
etag: Some("etag1".into()),
|
||||
checksum: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
|
@ -1147,6 +1191,7 @@ mod tests {
|
|||
version: uuid,
|
||||
size: None,
|
||||
etag: None,
|
||||
checksum: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
|
@ -1158,6 +1203,7 @@ mod tests {
|
|||
version: uuid,
|
||||
size: Some(10),
|
||||
etag: Some("etag2".into()),
|
||||
checksum: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
|
@ -1169,6 +1215,7 @@ mod tests {
|
|||
version: uuid,
|
||||
size: Some(7),
|
||||
etag: Some("etag3".into()),
|
||||
checksum: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
|
@ -1180,6 +1227,7 @@ mod tests {
|
|||
version: uuid,
|
||||
size: Some(5),
|
||||
etag: Some("etag4".into()),
|
||||
checksum: None,
|
||||
},
|
||||
),
|
||||
];
|
||||
|
@ -1218,12 +1266,14 @@ mod tests {
|
|||
etag: "etag1",
|
||||
timestamp: TS,
|
||||
part_number: 1,
|
||||
size: 3
|
||||
size: 3,
|
||||
checksum: None,
|
||||
},
|
||||
PartInfo {
|
||||
etag: "etag2",
|
||||
timestamp: TS,
|
||||
part_number: 3,
|
||||
checksum: None,
|
||||
size: 10
|
||||
},
|
||||
]
|
||||
|
@ -1239,12 +1289,14 @@ mod tests {
|
|||
PartInfo {
|
||||
etag: "etag3",
|
||||
timestamp: TS,
|
||||
checksum: None,
|
||||
part_number: 5,
|
||||
size: 7
|
||||
},
|
||||
PartInfo {
|
||||
etag: "etag4",
|
||||
timestamp: TS,
|
||||
checksum: None,
|
||||
part_number: 8,
|
||||
size: 5
|
||||
},
|
||||
|
@ -1268,24 +1320,28 @@ mod tests {
|
|||
PartInfo {
|
||||
etag: "etag1",
|
||||
timestamp: TS,
|
||||
checksum: None,
|
||||
part_number: 1,
|
||||
size: 3
|
||||
},
|
||||
PartInfo {
|
||||
etag: "etag2",
|
||||
timestamp: TS,
|
||||
checksum: None,
|
||||
part_number: 3,
|
||||
size: 10
|
||||
},
|
||||
PartInfo {
|
||||
etag: "etag3",
|
||||
timestamp: TS,
|
||||
checksum: None,
|
||||
part_number: 5,
|
||||
size: 7
|
||||
},
|
||||
PartInfo {
|
||||
etag: "etag4",
|
||||
timestamp: TS,
|
||||
checksum: None,
|
||||
part_number: 8,
|
||||
size: 5
|
||||
},
|
||||
|
|
|
@ -13,5 +13,7 @@ mod post_object;
|
|||
mod put;
|
||||
mod website;
|
||||
|
||||
mod checksum;
|
||||
mod encryption;
|
||||
mod router;
|
||||
pub mod xml;
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::sync::Arc;
|
||||
|
||||
use base64::prelude::*;
|
||||
use futures::prelude::*;
|
||||
use hyper::{Request, Response};
|
||||
use md5::{Digest as Md5Digest, Md5};
|
||||
|
||||
use garage_table::*;
|
||||
use garage_util::data::*;
|
||||
|
@ -16,6 +17,8 @@ use garage_model::s3::version_table::*;
|
|||
|
||||
use crate::helpers::*;
|
||||
use crate::s3::api_server::{ReqBody, ResBody};
|
||||
use crate::s3::checksum::*;
|
||||
use crate::s3::encryption::EncryptionParams;
|
||||
use crate::s3::error::*;
|
||||
use crate::s3::put::*;
|
||||
use crate::s3::xml as s3_xml;
|
||||
|
@ -40,6 +43,16 @@ pub async fn handle_create_multipart_upload(
|
|||
let timestamp = next_timestamp(existing_object.as_ref());
|
||||
|
||||
let headers = get_headers(req.headers())?;
|
||||
let meta = ObjectVersionMetaInner {
|
||||
headers,
|
||||
checksum: None,
|
||||
};
|
||||
|
||||
// Determine whether object should be encrypted, and if so the key
|
||||
let encryption = EncryptionParams::new_from_headers(&garage, req.headers())?;
|
||||
let object_encryption = encryption.encrypt_meta(meta)?;
|
||||
|
||||
let checksum_algorithm = request_checksum_algorithm(req.headers())?;
|
||||
|
||||
// Create object in object table
|
||||
let object_version = ObjectVersion {
|
||||
|
@ -47,7 +60,8 @@ pub async fn handle_create_multipart_upload(
|
|||
timestamp,
|
||||
state: ObjectVersionState::Uploading {
|
||||
multipart: true,
|
||||
headers,
|
||||
encryption: object_encryption,
|
||||
checksum_algorithm,
|
||||
},
|
||||
};
|
||||
let object = Object::new(*bucket_id, key.to_string(), vec![object_version]);
|
||||
|
@ -68,7 +82,9 @@ pub async fn handle_create_multipart_upload(
|
|||
};
|
||||
let xml = s3_xml::to_xml_with_header(&result)?;
|
||||
|
||||
Ok(Response::new(string_body(xml)))
|
||||
let mut resp = Response::builder();
|
||||
encryption.add_response_headers(&mut resp);
|
||||
Ok(resp.body(string_body(xml))?)
|
||||
}
|
||||
|
||||
pub async fn handle_put_part(
|
||||
|
@ -83,20 +99,37 @@ pub async fn handle_put_part(
|
|||
|
||||
let upload_id = decode_upload_id(upload_id)?;
|
||||
|
||||
let content_md5 = match req.headers().get("content-md5") {
|
||||
let expected_checksums = ExpectedChecksums {
|
||||
md5: match req.headers().get("content-md5") {
|
||||
Some(x) => Some(x.to_str()?.to_string()),
|
||||
None => None,
|
||||
},
|
||||
sha256: content_sha256,
|
||||
extra: request_checksum_value(req.headers())?,
|
||||
};
|
||||
|
||||
// Read first chunk, and at the same time try to get object to see if it exists
|
||||
let key = key.to_string();
|
||||
|
||||
let stream = body_stream(req.into_body());
|
||||
let (req_head, req_body) = req.into_parts();
|
||||
let stream = body_stream(req_body);
|
||||
let mut chunker = StreamChunker::new(stream, garage.config.block_size);
|
||||
|
||||
let ((_, _, mut mpu), first_block) =
|
||||
let ((_, object_version, mut mpu), first_block) =
|
||||
futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?;
|
||||
|
||||
// Check encryption params
|
||||
let (object_encryption, checksum_algorithm) = match object_version.state {
|
||||
ObjectVersionState::Uploading {
|
||||
encryption,
|
||||
checksum_algorithm,
|
||||
..
|
||||
} => (encryption, checksum_algorithm),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let (encryption, _) =
|
||||
EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?;
|
||||
|
||||
// Check object is valid and part can be accepted
|
||||
let first_block = first_block.ok_or_bad_request("Empty body")?;
|
||||
|
||||
|
@ -122,7 +155,9 @@ pub async fn handle_put_part(
|
|||
mpu_part_key,
|
||||
MpuPart {
|
||||
version: version_uuid,
|
||||
// all these are filled in later, at the end of this function
|
||||
etag: None,
|
||||
checksum: None,
|
||||
size: None,
|
||||
},
|
||||
);
|
||||
|
@ -136,24 +171,31 @@ pub async fn handle_put_part(
|
|||
garage.version_table.insert(&version).await?;
|
||||
|
||||
// Copy data to version
|
||||
let (total_size, data_md5sum, data_sha256sum, _) =
|
||||
read_and_put_blocks(&ctx, &version, part_number, first_block, &mut chunker).await?;
|
||||
let checksummer =
|
||||
Checksummer::init(&expected_checksums, !encryption.is_encrypted()).add(checksum_algorithm);
|
||||
let (total_size, checksums, _) = read_and_put_blocks(
|
||||
&ctx,
|
||||
&version,
|
||||
encryption,
|
||||
part_number,
|
||||
first_block,
|
||||
&mut chunker,
|
||||
checksummer,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Verify that checksums match
|
||||
ensure_checksum_matches(
|
||||
data_md5sum.as_slice(),
|
||||
data_sha256sum,
|
||||
content_md5.as_deref(),
|
||||
content_sha256,
|
||||
)?;
|
||||
checksums.verify(&expected_checksums)?;
|
||||
|
||||
// Store part etag in version
|
||||
let data_md5sum_hex = hex::encode(data_md5sum);
|
||||
let etag = encryption.etag_from_md5(&checksums.md5);
|
||||
|
||||
mpu.parts.put(
|
||||
mpu_part_key,
|
||||
MpuPart {
|
||||
version: version_uuid,
|
||||
etag: Some(data_md5sum_hex.clone()),
|
||||
etag: Some(etag.clone()),
|
||||
checksum: checksums.extract(checksum_algorithm),
|
||||
size: Some(total_size),
|
||||
},
|
||||
);
|
||||
|
@ -163,11 +205,10 @@ pub async fn handle_put_part(
|
|||
// We won't have to clean up on drop.
|
||||
interrupted_cleanup.cancel();
|
||||
|
||||
let response = Response::builder()
|
||||
.header("ETag", format!("\"{}\"", data_md5sum_hex))
|
||||
.body(empty_body())
|
||||
.unwrap();
|
||||
Ok(response)
|
||||
let mut resp = Response::builder().header("ETag", format!("\"{}\"", etag));
|
||||
encryption.add_response_headers(&mut resp);
|
||||
let resp = add_checksum_response_headers(&expected_checksums.extra, resp);
|
||||
Ok(resp.body(empty_body())?)
|
||||
}
|
||||
|
||||
struct InterruptedCleanup(Option<InterruptedCleanupInner>);
|
||||
|
@ -214,10 +255,11 @@ pub async fn handle_complete_multipart_upload(
|
|||
bucket_name,
|
||||
..
|
||||
} = &ctx;
|
||||
let (req_head, req_body) = req.into_parts();
|
||||
|
||||
let body = http_body_util::BodyExt::collect(req.into_body())
|
||||
.await?
|
||||
.to_bytes();
|
||||
let expected_checksum = request_checksum_value(&req_head.headers)?;
|
||||
|
||||
let body = http_body_util::BodyExt::collect(req_body).await?.to_bytes();
|
||||
|
||||
if let Some(content_sha256) = content_sha256 {
|
||||
verify_signed_content(content_sha256, &body[..])?;
|
||||
|
@ -241,8 +283,12 @@ pub async fn handle_complete_multipart_upload(
|
|||
return Err(Error::bad_request("No data was uploaded"));
|
||||
}
|
||||
|
||||
let headers = match object_version.state {
|
||||
ObjectVersionState::Uploading { headers, .. } => headers,
|
||||
let (object_encryption, checksum_algorithm) = match object_version.state {
|
||||
ObjectVersionState::Uploading {
|
||||
encryption,
|
||||
checksum_algorithm,
|
||||
..
|
||||
} => (encryption, checksum_algorithm),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
|
@ -270,6 +316,13 @@ pub async fn handle_complete_multipart_upload(
|
|||
for req_part in body_list_of_parts.iter() {
|
||||
match have_parts.get(&req_part.part_number) {
|
||||
Some(part) if part.etag.as_ref() == Some(&req_part.etag) && part.size.is_some() => {
|
||||
// alternative version: if req_part.checksum.is_some() && part.checksum != req_part.checksum {
|
||||
if part.checksum != req_part.checksum {
|
||||
return Err(Error::InvalidDigest(format!(
|
||||
"Invalid checksum for part {}: in request = {:?}, uploaded part = {:?}",
|
||||
req_part.part_number, req_part.checksum, part.checksum
|
||||
)));
|
||||
}
|
||||
parts.push(*part)
|
||||
}
|
||||
_ => return Err(Error::InvalidPart),
|
||||
|
@ -317,18 +370,23 @@ pub async fn handle_complete_multipart_upload(
|
|||
});
|
||||
garage.block_ref_table.insert_many(block_refs).await?;
|
||||
|
||||
// Calculate etag of final object
|
||||
// Calculate checksum and etag of final object
|
||||
// To understand how etags are calculated, read more here:
|
||||
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html
|
||||
// https://teppen.io/2018/06/23/aws_s3_etags/
|
||||
let mut etag_md5_hasher = Md5::new();
|
||||
let mut checksummer = MultipartChecksummer::init(checksum_algorithm);
|
||||
for part in parts.iter() {
|
||||
etag_md5_hasher.update(part.etag.as_ref().unwrap().as_bytes());
|
||||
checksummer.update(part.etag.as_ref().unwrap(), part.checksum)?;
|
||||
}
|
||||
let etag = format!(
|
||||
"{}-{}",
|
||||
hex::encode(etag_md5_hasher.finalize()),
|
||||
parts.len()
|
||||
);
|
||||
let (checksum_md5, checksum_extra) = checksummer.finalize();
|
||||
|
||||
if expected_checksum.is_some() && checksum_extra != expected_checksum {
|
||||
return Err(Error::InvalidDigest(
|
||||
"Failed to validate x-amz-checksum-*".into(),
|
||||
));
|
||||
}
|
||||
|
||||
let etag = format!("{}-{}", hex::encode(&checksum_md5[..]), parts.len());
|
||||
|
||||
// Calculate total size of final object
|
||||
let total_size = parts.iter().map(|x| x.size.unwrap()).sum();
|
||||
|
@ -341,10 +399,24 @@ pub async fn handle_complete_multipart_upload(
|
|||
return Err(e);
|
||||
}
|
||||
|
||||
// If there is a checksum algorithm, update metadata with checksum
|
||||
let object_encryption = match checksum_algorithm {
|
||||
None => object_encryption,
|
||||
Some(_) => {
|
||||
let (encryption, meta) =
|
||||
EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?;
|
||||
let new_meta = ObjectVersionMetaInner {
|
||||
headers: meta.into_owned().headers,
|
||||
checksum: checksum_extra,
|
||||
};
|
||||
encryption.encrypt_meta(new_meta)?
|
||||
}
|
||||
};
|
||||
|
||||
// Write final object version
|
||||
object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
|
||||
ObjectVersionMeta {
|
||||
headers,
|
||||
encryption: object_encryption,
|
||||
size: total_size,
|
||||
etag: etag.clone(),
|
||||
},
|
||||
|
@ -361,10 +433,28 @@ pub async fn handle_complete_multipart_upload(
|
|||
bucket: s3_xml::Value(bucket_name.to_string()),
|
||||
key: s3_xml::Value(key),
|
||||
etag: s3_xml::Value(format!("\"{}\"", etag)),
|
||||
checksum_crc32: match &checksum_extra {
|
||||
Some(ChecksumValue::Crc32(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))),
|
||||
_ => None,
|
||||
},
|
||||
checksum_crc32c: match &checksum_extra {
|
||||
Some(ChecksumValue::Crc32c(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))),
|
||||
_ => None,
|
||||
},
|
||||
checksum_sha1: match &checksum_extra {
|
||||
Some(ChecksumValue::Sha1(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))),
|
||||
_ => None,
|
||||
},
|
||||
checksum_sha256: match &checksum_extra {
|
||||
Some(ChecksumValue::Sha256(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))),
|
||||
_ => None,
|
||||
},
|
||||
};
|
||||
let xml = s3_xml::to_xml_with_header(&result)?;
|
||||
|
||||
Ok(Response::new(string_body(xml)))
|
||||
let resp = Response::builder();
|
||||
let resp = add_checksum_response_headers(&expected_checksum, resp);
|
||||
Ok(resp.body(string_body(xml))?)
|
||||
}
|
||||
|
||||
pub async fn handle_abort_multipart_upload(
|
||||
|
@ -433,6 +523,7 @@ pub fn decode_upload_id(id: &str) -> Result<Uuid, Error> {
|
|||
struct CompleteMultipartUploadPart {
|
||||
etag: String,
|
||||
part_number: u64,
|
||||
checksum: Option<ChecksumValue>,
|
||||
}
|
||||
|
||||
fn parse_complete_multipart_upload_body(
|
||||
|
@ -458,9 +549,41 @@ fn parse_complete_multipart_upload_body(
|
|||
.children()
|
||||
.find(|e| e.has_tag_name("PartNumber"))?
|
||||
.text()?;
|
||||
let checksum = if let Some(crc32) =
|
||||
item.children().find(|e| e.has_tag_name("ChecksumCRC32"))
|
||||
{
|
||||
Some(ChecksumValue::Crc32(
|
||||
BASE64_STANDARD.decode(crc32.text()?).ok()?[..]
|
||||
.try_into()
|
||||
.ok()?,
|
||||
))
|
||||
} else if let Some(crc32c) = item.children().find(|e| e.has_tag_name("ChecksumCRC32C"))
|
||||
{
|
||||
Some(ChecksumValue::Crc32c(
|
||||
BASE64_STANDARD.decode(crc32c.text()?).ok()?[..]
|
||||
.try_into()
|
||||
.ok()?,
|
||||
))
|
||||
} else if let Some(sha1) = item.children().find(|e| e.has_tag_name("ChecksumSHA1")) {
|
||||
Some(ChecksumValue::Sha1(
|
||||
BASE64_STANDARD.decode(sha1.text()?).ok()?[..]
|
||||
.try_into()
|
||||
.ok()?,
|
||||
))
|
||||
} else if let Some(sha256) = item.children().find(|e| e.has_tag_name("ChecksumSHA256"))
|
||||
{
|
||||
Some(ChecksumValue::Sha256(
|
||||
BASE64_STANDARD.decode(sha256.text()?).ok()?[..]
|
||||
.try_into()
|
||||
.ok()?,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
parts.push(CompleteMultipartUploadPart {
|
||||
etag: etag.trim_matches('"').to_string(),
|
||||
part_number: part_number.parse().ok()?,
|
||||
checksum,
|
||||
});
|
||||
} else {
|
||||
return None;
|
||||
|
|
|
@ -14,12 +14,15 @@ use multer::{Constraints, Multipart, SizeLimit};
|
|||
use serde::Deserialize;
|
||||
|
||||
use garage_model::garage::Garage;
|
||||
use garage_model::s3::object_table::*;
|
||||
|
||||
use crate::helpers::*;
|
||||
use crate::s3::api_server::ResBody;
|
||||
use crate::s3::checksum::*;
|
||||
use crate::s3::cors::*;
|
||||
use crate::s3::encryption::EncryptionParams;
|
||||
use crate::s3::error::*;
|
||||
use crate::s3::put::{get_headers, save_stream};
|
||||
use crate::s3::put::{get_headers, save_stream, ChecksumMode};
|
||||
use crate::s3::xml as s3_xml;
|
||||
use crate::signature::payload::{verify_v4, Authorization};
|
||||
|
||||
|
@ -48,13 +51,17 @@ pub async fn handle_post_object(
|
|||
let mut multipart = Multipart::with_constraints(stream, boundary, constraints);
|
||||
|
||||
let mut params = HeaderMap::new();
|
||||
let field = loop {
|
||||
let file_field = loop {
|
||||
let field = if let Some(field) = multipart.next_field().await? {
|
||||
field
|
||||
} else {
|
||||
return Err(Error::bad_request("Request did not contain a file"));
|
||||
};
|
||||
let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) {
|
||||
let name: HeaderName = if let Some(Ok(name)) = field
|
||||
.name()
|
||||
.map(str::to_ascii_lowercase)
|
||||
.map(TryInto::try_into)
|
||||
{
|
||||
name
|
||||
} else {
|
||||
continue;
|
||||
|
@ -96,7 +103,7 @@ pub async fn handle_post_object(
|
|||
|
||||
let key = if key.contains("${filename}") {
|
||||
// if no filename is provided, don't replace. This matches the behavior of AWS.
|
||||
if let Some(filename) = field.file_name() {
|
||||
if let Some(filename) = file_field.file_name() {
|
||||
key.replace("${filename}", filename)
|
||||
} else {
|
||||
key.to_owned()
|
||||
|
@ -143,9 +150,8 @@ pub async fn handle_post_object(
|
|||
let mut conditions = decoded_policy.into_conditions()?;
|
||||
|
||||
for (param_key, value) in params.iter() {
|
||||
let mut param_key = param_key.to_string();
|
||||
param_key.make_ascii_lowercase();
|
||||
match param_key.as_str() {
|
||||
let param_key = param_key.as_str();
|
||||
match param_key {
|
||||
"policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields
|
||||
"content-type" => {
|
||||
let conds = conditions.params.remove("content-type").ok_or_else(|| {
|
||||
|
@ -190,7 +196,7 @@ pub async fn handle_post_object(
|
|||
// how aws seems to behave.
|
||||
continue;
|
||||
}
|
||||
let conds = conditions.params.remove(¶m_key).ok_or_else(|| {
|
||||
let conds = conditions.params.remove(param_key).ok_or_else(|| {
|
||||
Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
|
||||
})?;
|
||||
for cond in conds {
|
||||
|
@ -218,8 +224,24 @@ pub async fn handle_post_object(
|
|||
|
||||
let headers = get_headers(¶ms)?;
|
||||
|
||||
let stream = field.map(|r| r.map_err(Into::into));
|
||||
let expected_checksums = ExpectedChecksums {
|
||||
md5: params
|
||||
.get("content-md5")
|
||||
.map(HeaderValue::to_str)
|
||||
.transpose()?
|
||||
.map(str::to_string),
|
||||
sha256: None,
|
||||
extra: request_checksum_algorithm_value(¶ms)?,
|
||||
};
|
||||
|
||||
let meta = ObjectVersionMetaInner {
|
||||
headers,
|
||||
checksum: expected_checksums.extra,
|
||||
};
|
||||
|
||||
let encryption = EncryptionParams::new_from_headers(&garage, ¶ms)?;
|
||||
|
||||
let stream = file_field.map(|r| r.map_err(Into::into));
|
||||
let ctx = ReqCtx {
|
||||
garage,
|
||||
bucket_id,
|
||||
|
@ -228,17 +250,17 @@ pub async fn handle_post_object(
|
|||
api_key,
|
||||
};
|
||||
|
||||
let (_, md5) = save_stream(
|
||||
let res = save_stream(
|
||||
&ctx,
|
||||
headers,
|
||||
meta,
|
||||
encryption,
|
||||
StreamLimiter::new(stream, conditions.content_length),
|
||||
&key,
|
||||
None,
|
||||
None,
|
||||
ChecksumMode::Verify(&expected_checksums),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let etag = format!("\"{}\"", md5);
|
||||
let etag = format!("\"{}\"", res.etag);
|
||||
|
||||
let mut resp = if let Some(mut target) = params
|
||||
.get("success_action_redirect")
|
||||
|
@ -252,11 +274,12 @@ pub async fn handle_post_object(
|
|||
.append_pair("key", &key)
|
||||
.append_pair("etag", &etag);
|
||||
let target = target.to_string();
|
||||
Response::builder()
|
||||
let mut resp = Response::builder()
|
||||
.status(StatusCode::SEE_OTHER)
|
||||
.header(header::LOCATION, target.clone())
|
||||
.header(header::ETAG, etag)
|
||||
.body(string_body(target))?
|
||||
.header(header::ETAG, etag);
|
||||
encryption.add_response_headers(&mut resp);
|
||||
resp.body(string_body(target))?
|
||||
} else {
|
||||
let path = head
|
||||
.uri
|
||||
|
@ -283,9 +306,10 @@ pub async fn handle_post_object(
|
|||
.get("success_action_status")
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.unwrap_or("204");
|
||||
let builder = Response::builder()
|
||||
let mut builder = Response::builder()
|
||||
.header(header::LOCATION, location.clone())
|
||||
.header(header::ETAG, etag.clone());
|
||||
encryption.add_response_headers(&mut builder);
|
||||
match action {
|
||||
"200" => builder.status(StatusCode::OK).body(empty_body())?,
|
||||
"201" => {
|
||||
|
|
|
@ -1,12 +1,9 @@
|
|||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use base64::prelude::*;
|
||||
use futures::prelude::*;
|
||||
use futures::stream::FuturesOrdered;
|
||||
use futures::try_join;
|
||||
use md5::{digest::generic_array::*, Digest as Md5Digest, Md5};
|
||||
use sha2::Sha256;
|
||||
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
|
@ -22,7 +19,6 @@ use opentelemetry::{
|
|||
use garage_net::bytes_buf::BytesBuf;
|
||||
use garage_rpc::rpc_helper::OrderTag;
|
||||
use garage_table::*;
|
||||
use garage_util::async_hash::*;
|
||||
use garage_util::data::*;
|
||||
use garage_util::error::Error as GarageError;
|
||||
use garage_util::time::*;
|
||||
|
@ -36,10 +32,24 @@ use garage_model::s3::version_table::*;
|
|||
|
||||
use crate::helpers::*;
|
||||
use crate::s3::api_server::{ReqBody, ResBody};
|
||||
use crate::s3::checksum::*;
|
||||
use crate::s3::encryption::EncryptionParams;
|
||||
use crate::s3::error::*;
|
||||
|
||||
const PUT_BLOCKS_MAX_PARALLEL: usize = 3;
|
||||
|
||||
/// Value returned by `save_stream` on success: identifies the object version that was stored.
pub(crate) struct SaveStreamResult {
|
||||
/// UUID generated for the new object version
pub(crate) version_uuid: Uuid,
|
||||
/// Timestamp assigned to the new object version
pub(crate) version_timestamp: u64,
|
||||
/// Etag WITHOUT THE QUOTES (just the hex value)
|
||||
pub(crate) etag: String,
|
||||
}
|
||||
|
||||
/// Selects what `save_stream` does with the checksums it computes over the stored data.
pub(crate) enum ChecksumMode<'a> {
|
||||
/// Verify the computed checksums against these expected values; a mismatch makes the upload fail
Verify(&'a ExpectedChecksums),
|
||||
/// Do not verify; instead store in the object metadata the checksum extracted for this algorithm
Calculate(Option<ChecksumAlgorithm>),
|
||||
}
|
||||
|
||||
pub async fn handle_put(
|
||||
ctx: ReqCtx,
|
||||
req: Request<ReqBody>,
|
||||
|
@ -50,26 +60,51 @@ pub async fn handle_put(
|
|||
let headers = get_headers(req.headers())?;
|
||||
debug!("Object headers: {:?}", headers);
|
||||
|
||||
let content_md5 = match req.headers().get("content-md5") {
|
||||
let expected_checksums = ExpectedChecksums {
|
||||
md5: match req.headers().get("content-md5") {
|
||||
Some(x) => Some(x.to_str()?.to_string()),
|
||||
None => None,
|
||||
},
|
||||
sha256: content_sha256,
|
||||
extra: request_checksum_value(req.headers())?,
|
||||
};
|
||||
|
||||
let meta = ObjectVersionMetaInner {
|
||||
headers,
|
||||
checksum: expected_checksums.extra,
|
||||
};
|
||||
|
||||
// Determine whether object should be encrypted, and if so the key
|
||||
let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?;
|
||||
|
||||
let stream = body_stream(req.into_body());
|
||||
|
||||
save_stream(&ctx, headers, stream, key, content_md5, content_sha256)
|
||||
.await
|
||||
.map(|(uuid, md5)| put_response(uuid, md5))
|
||||
let res = save_stream(
|
||||
&ctx,
|
||||
meta,
|
||||
encryption,
|
||||
stream,
|
||||
key,
|
||||
ChecksumMode::Verify(&expected_checksums),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut resp = Response::builder()
|
||||
.header("x-amz-version-id", hex::encode(res.version_uuid))
|
||||
.header("ETag", format!("\"{}\"", res.etag));
|
||||
encryption.add_response_headers(&mut resp);
|
||||
let resp = add_checksum_response_headers(&expected_checksums.extra, resp);
|
||||
Ok(resp.body(empty_body())?)
|
||||
}
|
||||
|
||||
pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
|
||||
ctx: &ReqCtx,
|
||||
headers: ObjectVersionHeaders,
|
||||
mut meta: ObjectVersionMetaInner,
|
||||
encryption: EncryptionParams,
|
||||
body: S,
|
||||
key: &String,
|
||||
content_md5: Option<String>,
|
||||
content_sha256: Option<FixedBytes32>,
|
||||
) -> Result<(Uuid, String), Error> {
|
||||
checksum_mode: ChecksumMode<'_>,
|
||||
) -> Result<SaveStreamResult, Error> {
|
||||
let ReqCtx {
|
||||
garage, bucket_id, ..
|
||||
} = ctx;
|
||||
|
@ -86,43 +121,55 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
|
|||
let version_uuid = gen_uuid();
|
||||
let version_timestamp = next_timestamp(existing_object.as_ref());
|
||||
|
||||
let mut checksummer = match checksum_mode {
|
||||
ChecksumMode::Verify(expected) => Checksummer::init(expected, !encryption.is_encrypted()),
|
||||
ChecksumMode::Calculate(algo) => {
|
||||
Checksummer::init(&Default::default(), !encryption.is_encrypted()).add(algo)
|
||||
}
|
||||
};
|
||||
|
||||
// If body is small enough, store it directly in the object table
|
||||
// as "inline data". We can then return immediately.
|
||||
if first_block.len() < INLINE_THRESHOLD {
|
||||
let mut md5sum = Md5::new();
|
||||
md5sum.update(&first_block[..]);
|
||||
let data_md5sum = md5sum.finalize();
|
||||
let data_md5sum_hex = hex::encode(data_md5sum);
|
||||
checksummer.update(&first_block);
|
||||
let checksums = checksummer.finalize();
|
||||
|
||||
match checksum_mode {
|
||||
ChecksumMode::Verify(expected) => {
|
||||
checksums.verify(&expected)?;
|
||||
}
|
||||
ChecksumMode::Calculate(algo) => {
|
||||
meta.checksum = checksums.extract(algo);
|
||||
}
|
||||
};
|
||||
|
||||
let data_sha256sum = sha256sum(&first_block[..]);
|
||||
let size = first_block.len() as u64;
|
||||
|
||||
ensure_checksum_matches(
|
||||
data_md5sum.as_slice(),
|
||||
data_sha256sum,
|
||||
content_md5.as_deref(),
|
||||
content_sha256,
|
||||
)?;
|
||||
|
||||
check_quotas(ctx, size, existing_object.as_ref()).await?;
|
||||
|
||||
let etag = encryption.etag_from_md5(&checksums.md5);
|
||||
let inline_data = encryption.encrypt_blob(&first_block)?.to_vec();
|
||||
|
||||
let object_version = ObjectVersion {
|
||||
uuid: version_uuid,
|
||||
timestamp: version_timestamp,
|
||||
state: ObjectVersionState::Complete(ObjectVersionData::Inline(
|
||||
ObjectVersionMeta {
|
||||
headers,
|
||||
encryption: encryption.encrypt_meta(meta)?,
|
||||
size,
|
||||
etag: data_md5sum_hex.clone(),
|
||||
etag: etag.clone(),
|
||||
},
|
||||
first_block.to_vec(),
|
||||
inline_data,
|
||||
)),
|
||||
};
|
||||
|
||||
let object = Object::new(*bucket_id, key.into(), vec![object_version]);
|
||||
garage.object_table.insert(&object).await?;
|
||||
|
||||
return Ok((version_uuid, data_md5sum_hex));
|
||||
return Ok(SaveStreamResult {
|
||||
version_uuid,
|
||||
version_timestamp,
|
||||
etag,
|
||||
});
|
||||
}
|
||||
|
||||
// The following consists in many steps that can each fail.
|
||||
|
@ -142,7 +189,8 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
|
|||
uuid: version_uuid,
|
||||
timestamp: version_timestamp,
|
||||
state: ObjectVersionState::Uploading {
|
||||
headers: headers.clone(),
|
||||
encryption: encryption.encrypt_meta(meta.clone())?,
|
||||
checksum_algorithm: None, // don't care; overwritten later
|
||||
multipart: false,
|
||||
},
|
||||
};
|
||||
|
@ -163,26 +211,39 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
|
|||
);
|
||||
garage.version_table.insert(&version).await?;
|
||||
|
||||
// Transfer data and verify checksum
|
||||
let (total_size, data_md5sum, data_sha256sum, first_block_hash) =
|
||||
read_and_put_blocks(ctx, &version, 1, first_block, &mut chunker).await?;
|
||||
// Transfer data
|
||||
let (total_size, checksums, first_block_hash) = read_and_put_blocks(
|
||||
ctx,
|
||||
&version,
|
||||
encryption,
|
||||
1,
|
||||
first_block,
|
||||
&mut chunker,
|
||||
checksummer,
|
||||
)
|
||||
.await?;
|
||||
|
||||
ensure_checksum_matches(
|
||||
data_md5sum.as_slice(),
|
||||
data_sha256sum,
|
||||
content_md5.as_deref(),
|
||||
content_sha256,
|
||||
)?;
|
||||
// Verify checksums are ok / add calculated checksum to metadata
|
||||
match checksum_mode {
|
||||
ChecksumMode::Verify(expected) => {
|
||||
checksums.verify(&expected)?;
|
||||
}
|
||||
ChecksumMode::Calculate(algo) => {
|
||||
meta.checksum = checksums.extract(algo);
|
||||
}
|
||||
};
|
||||
|
||||
// Verify quotas are respected
|
||||
check_quotas(ctx, total_size, existing_object.as_ref()).await?;
|
||||
|
||||
// Save final object state, marked as Complete
|
||||
let md5sum_hex = hex::encode(data_md5sum);
|
||||
let etag = encryption.etag_from_md5(&checksums.md5);
|
||||
|
||||
object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
|
||||
ObjectVersionMeta {
|
||||
headers,
|
||||
encryption: encryption.encrypt_meta(meta)?,
|
||||
size: total_size,
|
||||
etag: md5sum_hex.clone(),
|
||||
etag: etag.clone(),
|
||||
},
|
||||
first_block_hash,
|
||||
));
|
||||
|
@ -193,34 +254,11 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
|
|||
// We won't have to clean up on drop.
|
||||
interrupted_cleanup.cancel();
|
||||
|
||||
Ok((version_uuid, md5sum_hex))
|
||||
}
|
||||
|
||||
/// Validate MD5 sum against content-md5 header
|
||||
/// and sha256sum against signed content-sha256
|
||||
pub(crate) fn ensure_checksum_matches(
|
||||
data_md5sum: &[u8],
|
||||
data_sha256sum: garage_util::data::FixedBytes32,
|
||||
content_md5: Option<&str>,
|
||||
content_sha256: Option<garage_util::data::FixedBytes32>,
|
||||
) -> Result<(), Error> {
|
||||
if let Some(expected_sha256) = content_sha256 {
|
||||
if expected_sha256 != data_sha256sum {
|
||||
return Err(Error::bad_request(
|
||||
"Unable to validate x-amz-content-sha256",
|
||||
));
|
||||
} else {
|
||||
trace!("Successfully validated x-amz-content-sha256");
|
||||
}
|
||||
}
|
||||
if let Some(expected_md5) = content_md5 {
|
||||
if expected_md5.trim_matches('"') != BASE64_STANDARD.encode(data_md5sum) {
|
||||
return Err(Error::bad_request("Unable to validate content-md5"));
|
||||
} else {
|
||||
trace!("Successfully validated content-md5");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
Ok(SaveStreamResult {
|
||||
version_uuid,
|
||||
version_timestamp,
|
||||
etag,
|
||||
})
|
||||
}
|
||||
|
||||
/// Check that inserting this object with this size doesn't exceed bucket quotas
|
||||
|
@ -248,7 +286,7 @@ pub(crate) async fn check_quotas(
|
|||
.await?;
|
||||
|
||||
let counters = counters
|
||||
.map(|x| x.filtered_values(&garage.system.ring.borrow()))
|
||||
.map(|x| x.filtered_values(&garage.system.cluster_layout()))
|
||||
.unwrap_or_default();
|
||||
|
||||
let (prev_cnt_obj, prev_cnt_size) = match prev_object {
|
||||
|
@ -290,10 +328,12 @@ pub(crate) async fn check_quotas(
|
|||
pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
|
||||
ctx: &ReqCtx,
|
||||
version: &Version,
|
||||
encryption: EncryptionParams,
|
||||
part_number: u64,
|
||||
first_block: Bytes,
|
||||
chunker: &mut StreamChunker<S>,
|
||||
) -> Result<(u64, GenericArray<u8, typenum::U16>, Hash, Hash), Error> {
|
||||
checksummer: Checksummer,
|
||||
) -> Result<(u64, Checksums, Hash), Error> {
|
||||
let tracer = opentelemetry::global::tracer("garage");
|
||||
|
||||
let (block_tx, mut block_rx) = mpsc::channel::<Result<Bytes, Error>>(2);
|
||||
|
@ -321,20 +361,20 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
|
|||
|
||||
let (block_tx2, mut block_rx2) = mpsc::channel::<Result<Bytes, Error>>(1);
|
||||
let hash_stream = async {
|
||||
let md5hasher = AsyncHasher::<Md5>::new();
|
||||
let sha256hasher = AsyncHasher::<Sha256>::new();
|
||||
let mut checksummer = checksummer;
|
||||
while let Some(next) = block_rx.recv().await {
|
||||
match next {
|
||||
Ok(block) => {
|
||||
block_tx2.send(Ok(block.clone())).await?;
|
||||
futures::future::join(
|
||||
md5hasher.update(block.clone()),
|
||||
sha256hasher.update(block.clone()),
|
||||
)
|
||||
checksummer = tokio::task::spawn_blocking(move || {
|
||||
checksummer.update(&block);
|
||||
checksummer
|
||||
})
|
||||
.with_context(Context::current_with_span(
|
||||
tracer.start("Hash block (md5, sha256)"),
|
||||
))
|
||||
.await;
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
Err(e) => {
|
||||
block_tx2.send(Err(e)).await?;
|
||||
|
@ -343,27 +383,38 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
|
|||
}
|
||||
}
|
||||
drop(block_tx2);
|
||||
Ok::<_, mpsc::error::SendError<_>>(futures::join!(
|
||||
md5hasher.finalize(),
|
||||
sha256hasher.finalize()
|
||||
))
|
||||
Ok::<_, mpsc::error::SendError<_>>(checksummer)
|
||||
};
|
||||
|
||||
let (block_tx3, mut block_rx3) = mpsc::channel::<Result<(Bytes, Hash), Error>>(1);
|
||||
let hash_blocks = async {
|
||||
let (block_tx3, mut block_rx3) = mpsc::channel::<Result<(Bytes, u64, Hash), Error>>(1);
|
||||
let encrypt_hash_blocks = async {
|
||||
let mut first_block_hash = None;
|
||||
while let Some(next) = block_rx2.recv().await {
|
||||
match next {
|
||||
Ok(block) => {
|
||||
let hash = async_blake2sum(block.clone())
|
||||
let unencrypted_len = block.len() as u64;
|
||||
let res = tokio::task::spawn_blocking(move || {
|
||||
let block = encryption.encrypt_block(block)?;
|
||||
let hash = blake2sum(&block);
|
||||
Ok((block, hash))
|
||||
})
|
||||
.with_context(Context::current_with_span(
|
||||
tracer.start("Hash block (blake2)"),
|
||||
tracer.start("Encrypt and hash (blake2) block"),
|
||||
))
|
||||
.await;
|
||||
.await
|
||||
.unwrap();
|
||||
match res {
|
||||
Ok((block, hash)) => {
|
||||
if first_block_hash.is_none() {
|
||||
first_block_hash = Some(hash);
|
||||
}
|
||||
block_tx3.send(Ok((block, hash))).await?;
|
||||
block_tx3.send(Ok((block, unencrypted_len, hash))).await?;
|
||||
}
|
||||
Err(e) => {
|
||||
block_tx3.send(Err(e)).await?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
block_tx3.send(Err(e)).await?;
|
||||
|
@ -398,7 +449,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
|
|||
block_rx3.recv().await
|
||||
}
|
||||
};
|
||||
let (block, hash) = tokio::select! {
|
||||
let (block, unencrypted_len, hash) = tokio::select! {
|
||||
result = write_futs_next => {
|
||||
result?;
|
||||
continue;
|
||||
|
@ -410,17 +461,18 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
|
|||
};
|
||||
|
||||
// For next block to be written: count its size and spawn future to write it
|
||||
let offset = written_bytes;
|
||||
written_bytes += block.len() as u64;
|
||||
write_futs.push_back(put_block_and_meta(
|
||||
ctx,
|
||||
version,
|
||||
part_number,
|
||||
offset,
|
||||
written_bytes,
|
||||
hash,
|
||||
block,
|
||||
unencrypted_len,
|
||||
encryption.is_encrypted(),
|
||||
order_stream.order(written_bytes),
|
||||
));
|
||||
written_bytes += unencrypted_len;
|
||||
}
|
||||
while let Some(res) = write_futs.next().await {
|
||||
res?;
|
||||
|
@ -429,17 +481,15 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
|
|||
};
|
||||
|
||||
let (_, stream_hash_result, block_hash_result, final_result) =
|
||||
futures::join!(read_blocks, hash_stream, hash_blocks, put_blocks);
|
||||
futures::join!(read_blocks, hash_stream, encrypt_hash_blocks, put_blocks);
|
||||
|
||||
let total_size = final_result?;
|
||||
// unwrap here is ok, because if hasher failed, it is because something failed
|
||||
// later in the pipeline which already caused a return at the ? on previous line
|
||||
let (data_md5sum, data_sha256sum) = stream_hash_result.unwrap();
|
||||
let first_block_hash = block_hash_result.unwrap();
|
||||
let checksums = stream_hash_result.unwrap().finalize();
|
||||
|
||||
let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap();
|
||||
|
||||
Ok((total_size, data_md5sum, data_sha256sum, first_block_hash))
|
||||
Ok((total_size, checksums, first_block_hash))
|
||||
}
|
||||
|
||||
async fn put_block_and_meta(
|
||||
|
@ -449,6 +499,8 @@ async fn put_block_and_meta(
|
|||
offset: u64,
|
||||
hash: Hash,
|
||||
block: Bytes,
|
||||
size: u64,
|
||||
is_encrypted: bool,
|
||||
order_tag: OrderTag,
|
||||
) -> Result<(), GarageError> {
|
||||
let ReqCtx { garage, .. } = ctx;
|
||||
|
@ -459,10 +511,7 @@ async fn put_block_and_meta(
|
|||
part_number,
|
||||
offset,
|
||||
},
|
||||
VersionBlock {
|
||||
hash,
|
||||
size: block.len() as u64,
|
||||
},
|
||||
VersionBlock { hash, size },
|
||||
);
|
||||
|
||||
let block_ref = BlockRef {
|
||||
|
@ -474,7 +523,7 @@ async fn put_block_and_meta(
|
|||
futures::try_join!(
|
||||
garage
|
||||
.block_manager
|
||||
.rpc_put_block(hash, block, Some(order_tag)),
|
||||
.rpc_put_block(hash, block, is_encrypted, Some(order_tag)),
|
||||
garage.version_table.insert(&version),
|
||||
garage.block_ref_table.insert(&block_ref),
|
||||
)?;
|
||||
|
@ -517,14 +566,6 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response<ResBody> {
|
||||
Response::builder()
|
||||
.header("x-amz-version-id", hex::encode(version_uuid))
|
||||
.header("ETag", format!("\"{}\"", md5sum_hex))
|
||||
.body(empty_body())
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
struct InterruptedCleanup(Option<InterruptedCleanupInner>);
|
||||
struct InterruptedCleanupInner {
|
||||
garage: Arc<Garage>,
|
||||
|
@ -559,57 +600,35 @@ impl Drop for InterruptedCleanup {
|
|||
|
||||
// ============ helpers ============
|
||||
|
||||
pub(crate) fn get_mime_type(headers: &HeaderMap<HeaderValue>) -> Result<String, Error> {
|
||||
Ok(headers
|
||||
.get(hyper::header::CONTENT_TYPE)
|
||||
.map(|x| x.to_str())
|
||||
.unwrap_or(Ok("blob"))?
|
||||
.to_string())
|
||||
}
|
||||
|
||||
pub(crate) fn get_headers(headers: &HeaderMap<HeaderValue>) -> Result<ObjectVersionHeaders, Error> {
|
||||
let content_type = get_mime_type(headers)?;
|
||||
let mut other = BTreeMap::new();
|
||||
pub(crate) fn get_headers(headers: &HeaderMap<HeaderValue>) -> Result<HeaderList, Error> {
|
||||
let mut ret = Vec::new();
|
||||
|
||||
// Preserve standard headers
|
||||
let standard_header = vec![
|
||||
hyper::header::CONTENT_TYPE,
|
||||
hyper::header::CACHE_CONTROL,
|
||||
hyper::header::CONTENT_DISPOSITION,
|
||||
hyper::header::CONTENT_ENCODING,
|
||||
hyper::header::CONTENT_LANGUAGE,
|
||||
hyper::header::EXPIRES,
|
||||
];
|
||||
for h in standard_header.iter() {
|
||||
if let Some(v) = headers.get(h) {
|
||||
match v.to_str() {
|
||||
Ok(v_str) => {
|
||||
other.insert(h.to_string(), v_str.to_string());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Discarding header {}, error in .to_str(): {}", h, e);
|
||||
}
|
||||
}
|
||||
for name in standard_header.iter() {
|
||||
if let Some(value) = headers.get(name) {
|
||||
ret.push((name.to_string(), value.to_str()?.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
// Preserve x-amz-meta- headers
|
||||
for (k, v) in headers.iter() {
|
||||
if k.as_str().starts_with("x-amz-meta-") {
|
||||
match std::str::from_utf8(v.as_bytes()) {
|
||||
Ok(v_str) => {
|
||||
other.insert(k.to_string(), v_str.to_string());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Discarding header {}, error in .to_str(): {}", k, e);
|
||||
}
|
||||
}
|
||||
for (name, value) in headers.iter() {
|
||||
if name.as_str().starts_with("x-amz-meta-") {
|
||||
ret.push((
|
||||
name.to_string(),
|
||||
std::str::from_utf8(value.as_bytes())?.to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ObjectVersionHeaders {
|
||||
content_type,
|
||||
other,
|
||||
})
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
pub(crate) fn next_timestamp(existing_object: Option<&Object>) -> u64 {
|
||||
|
|
|
@ -131,6 +131,14 @@ pub struct CompleteMultipartUploadResult {
|
|||
pub key: Value,
|
||||
#[serde(rename = "ETag")]
|
||||
pub etag: Value,
|
||||
#[serde(rename = "ChecksumCRC32")]
|
||||
pub checksum_crc32: Option<Value>,
|
||||
#[serde(rename = "ChecksumCRC32C")]
|
||||
pub checksum_crc32c: Option<Value>,
|
||||
#[serde(rename = "ChecksumSHA1")]
|
||||
pub checksum_sha1: Option<Value>,
|
||||
#[serde(rename = "ChecksumSHA256")]
|
||||
pub checksum_sha256: Option<Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, PartialEq, Eq)]
|
||||
|
@ -197,6 +205,14 @@ pub struct PartItem {
|
|||
pub part_number: IntValue,
|
||||
#[serde(rename = "Size")]
|
||||
pub size: IntValue,
|
||||
#[serde(rename = "ChecksumCRC32")]
|
||||
pub checksum_crc32: Option<Value>,
|
||||
#[serde(rename = "ChecksumCRC32C")]
|
||||
pub checksum_crc32c: Option<Value>,
|
||||
#[serde(rename = "ChecksumSHA1")]
|
||||
pub checksum_sha1: Option<Value>,
|
||||
#[serde(rename = "ChecksumSHA256")]
|
||||
pub checksum_sha256: Option<Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, PartialEq, Eq)]
|
||||
|
@ -500,6 +516,10 @@ mod tests {
|
|||
bucket: Value("mybucket".to_string()),
|
||||
key: Value("a/plop".to_string()),
|
||||
etag: Value("\"3858f62230ac3c915f300c664312c11f-9\"".to_string()),
|
||||
checksum_crc32: None,
|
||||
checksum_crc32c: None,
|
||||
checksum_sha1: Some(Value("ZJAnHyG8PeKz9tI8UTcHrJos39A=".into())),
|
||||
checksum_sha256: None,
|
||||
};
|
||||
assert_eq!(
|
||||
to_xml_with_header(&result)?,
|
||||
|
@ -509,6 +529,7 @@ mod tests {
|
|||
<Bucket>mybucket</Bucket>\
|
||||
<Key>a/plop</Key>\
|
||||
<ETag>"3858f62230ac3c915f300c664312c11f-9"</ETag>\
|
||||
<ChecksumSHA1>ZJAnHyG8PeKz9tI8UTcHrJos39A=</ChecksumSHA1>\
|
||||
</CompleteMultipartUploadResult>"
|
||||
);
|
||||
Ok(())
|
||||
|
@ -780,12 +801,22 @@ mod tests {
|
|||
last_modified: Value("2010-11-10T20:48:34.000Z".to_string()),
|
||||
part_number: IntValue(2),
|
||||
size: IntValue(10485760),
|
||||
checksum_crc32: None,
|
||||
checksum_crc32c: None,
|
||||
checksum_sha256: Some(Value(
|
||||
"5RQ3A5uk0w7ojNjvegohch4JRBBGN/cLhsNrPzfv/hA=".into(),
|
||||
)),
|
||||
checksum_sha1: None,
|
||||
},
|
||||
PartItem {
|
||||
etag: Value("\"aaaa18db4cc2f85cedef654fccc4a4x8\"".to_string()),
|
||||
last_modified: Value("2010-11-10T20:48:33.000Z".to_string()),
|
||||
part_number: IntValue(3),
|
||||
size: IntValue(10485760),
|
||||
checksum_sha256: None,
|
||||
checksum_crc32c: None,
|
||||
checksum_crc32: Some(Value("ZJAnHyG8=".into())),
|
||||
checksum_sha1: None,
|
||||
},
|
||||
],
|
||||
initiator: Initiator {
|
||||
|
@ -820,12 +851,14 @@ mod tests {
|
|||
<LastModified>2010-11-10T20:48:34.000Z</LastModified>\
|
||||
<PartNumber>2</PartNumber>\
|
||||
<Size>10485760</Size>\
|
||||
<ChecksumSHA256>5RQ3A5uk0w7ojNjvegohch4JRBBGN/cLhsNrPzfv/hA=</ChecksumSHA256>\
|
||||
</Part>\
|
||||
<Part>\
|
||||
<ETag>"aaaa18db4cc2f85cedef654fccc4a4x8"</ETag>\
|
||||
<LastModified>2010-11-10T20:48:33.000Z</LastModified>\
|
||||
<PartNumber>3</PartNumber>\
|
||||
<Size>10485760</Size>\
|
||||
<ChecksumCRC32>ZJAnHyG8=</ChecksumCRC32>\
|
||||
</Part>\
|
||||
<Initiator>\
|
||||
<DisplayName>umat-user-11116a31-17b5-4fb7-9df5-b288870f11xx</DisplayName>\
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "garage_block"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
authors = ["Alex Auvolat <alex@adnab.me>"]
|
||||
edition = "2018"
|
||||
license = "AGPL-3.0"
|
||||
|
|
|
@ -96,7 +96,7 @@ impl DataBlock {
|
|||
}
|
||||
}
|
||||
|
||||
fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> {
|
||||
pub fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> {
|
||||
let mut result = Vec::<u8>::new();
|
||||
let mut encoder = Encoder::new(&mut result, level)?;
|
||||
encoder.include_checksum(true)?;
|
||||
|
|
|
@ -9,3 +9,6 @@ mod block;
|
|||
mod layout;
|
||||
mod metrics;
|
||||
mod rc;
|
||||
|
||||
pub use block::zstd_encode;
|
||||
pub use rc::CalculateRefcount;
|
||||
|
|
|
@ -89,7 +89,7 @@ pub struct BlockManager {
|
|||
|
||||
mutation_lock: Vec<Mutex<BlockManagerLocked>>,
|
||||
|
||||
pub(crate) rc: BlockRc,
|
||||
pub rc: BlockRc,
|
||||
pub resync: BlockResyncManager,
|
||||
|
||||
pub(crate) system: Arc<System>,
|
||||
|
@ -158,7 +158,7 @@ impl BlockManager {
|
|||
|
||||
let metrics = BlockManagerMetrics::new(
|
||||
config.compression_level,
|
||||
rc.rc.clone(),
|
||||
rc.rc_table.clone(),
|
||||
resync.queue.clone(),
|
||||
resync.errors.clone(),
|
||||
buffer_kb_semaphore.clone(),
|
||||
|
@ -233,6 +233,12 @@ impl BlockManager {
|
|||
}
|
||||
}
|
||||
|
||||
/// Initialization: set how block references are recalculated
|
||||
/// for repair operations
|
||||
///
/// `recalc` is the list of callbacks whose results `BlockRc::recalculate_rc` sums up.
pub fn set_recalc_rc(&self, recalc: Vec<CalculateRefcount>) {
|
||||
// Overwrites whatever list was stored before; until a list has been stored,
// `BlockRc::recalculate_rc` returns an error.
self.rc.recalc_rc.store(Some(Arc::new(recalc)));
|
||||
}
|
||||
|
||||
/// Ask nodes that might have a (possibly compressed) block for it
|
||||
/// Return it as a stream with a header
|
||||
async fn rpc_get_raw_block_streaming(
|
||||
|
@ -279,8 +285,10 @@ impl BlockManager {
|
|||
F: Fn(DataBlockStream) -> Fut,
|
||||
Fut: futures::Future<Output = Result<T, Error>>,
|
||||
{
|
||||
let who = self.replication.read_nodes(hash);
|
||||
let who = self.system.rpc.request_order(&who);
|
||||
let who = self
|
||||
.system
|
||||
.rpc_helper()
|
||||
.block_read_nodes_of(hash, self.system.rpc_helper());
|
||||
|
||||
for node in who.iter() {
|
||||
let node_id = NodeID::from(*node);
|
||||
|
@ -320,15 +328,15 @@ impl BlockManager {
|
|||
// if the first one doesn't succeed rapidly
|
||||
// TODO: keep first request running when initiating a new one and take the
|
||||
// one that finishes earlier
|
||||
_ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => {
|
||||
_ = tokio::time::sleep(self.system.rpc_helper().rpc_timeout()) => {
|
||||
debug!("Get block {:?}: node {:?} didn't return block in time, trying next.", hash, node);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let msg = format!("Get block {:?}: no node returned a valid block", hash);
|
||||
debug!("{}", msg);
|
||||
Err(Error::Message(msg))
|
||||
let err = Error::MissingBlock(*hash);
|
||||
debug!("{}", err);
|
||||
Err(err)
|
||||
}
|
||||
|
||||
// ---- Public interface ----
|
||||
|
@ -355,26 +363,18 @@ impl BlockManager {
|
|||
}
|
||||
}
|
||||
|
||||
/// Ask nodes that might have a block for it, return it as one big Bytes
|
||||
pub async fn rpc_get_block(
|
||||
&self,
|
||||
hash: &Hash,
|
||||
order_tag: Option<OrderTag>,
|
||||
) -> Result<Bytes, Error> {
|
||||
let stream = self.rpc_get_block_streaming(hash, order_tag).await?;
|
||||
Ok(read_stream_to_end(stream).await?.into_bytes())
|
||||
}
|
||||
|
||||
/// Send block to nodes that should have it
|
||||
pub async fn rpc_put_block(
|
||||
&self,
|
||||
hash: Hash,
|
||||
data: Bytes,
|
||||
prevent_compression: bool,
|
||||
order_tag: Option<OrderTag>,
|
||||
) -> Result<(), Error> {
|
||||
let who = self.replication.write_nodes(&hash);
|
||||
let who = self.replication.write_sets(&hash);
|
||||
|
||||
let (header, bytes) = DataBlock::from_buffer(data, self.compression_level)
|
||||
let compression_level = self.compression_level.filter(|_| !prevent_compression);
|
||||
let (header, bytes) = DataBlock::from_buffer(data, compression_level)
|
||||
.await
|
||||
.into_parts();
|
||||
|
||||
|
@ -394,10 +394,10 @@ impl BlockManager {
|
|||
};
|
||||
|
||||
self.system
|
||||
.rpc
|
||||
.try_call_many(
|
||||
.rpc_helper()
|
||||
.try_write_many_sets(
|
||||
&self.endpoint,
|
||||
&who[..],
|
||||
who.as_ref(),
|
||||
put_block_rpc,
|
||||
RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY)
|
||||
.with_drop_on_completion(permit)
|
||||
|
@ -410,12 +410,7 @@ impl BlockManager {
|
|||
|
||||
/// Get number of items in the refcount table
|
||||
pub fn rc_len(&self) -> Result<usize, Error> {
|
||||
Ok(self.rc.rc.len()?)
|
||||
}
|
||||
|
||||
/// Get number of items in the refcount table
|
||||
pub fn rc_fast_len(&self) -> Result<Option<usize>, Error> {
|
||||
Ok(self.rc.rc.fast_len()?)
|
||||
Ok(self.rc.rc_table.len()?)
|
||||
}
|
||||
|
||||
/// Send command to start/stop/manage scrub worker
|
||||
|
@ -433,7 +428,7 @@ impl BlockManager {
|
|||
|
||||
/// List all resync errors
|
||||
pub fn list_resync_errors(&self) -> Result<Vec<BlockResyncErrorInfo>, Error> {
|
||||
let mut blocks = Vec::with_capacity(self.resync.errors.len());
|
||||
let mut blocks = Vec::with_capacity(self.resync.errors.len()?);
|
||||
for ent in self.resync.errors.iter()? {
|
||||
let (hash, cnt) = ent?;
|
||||
let cnt = ErrorCounter::decode(&cnt);
|
||||
|
@ -471,7 +466,7 @@ impl BlockManager {
|
|||
tokio::spawn(async move {
|
||||
if let Err(e) = this
|
||||
.resync
|
||||
.put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout())
|
||||
.put_to_resync(&hash, 2 * this.system.rpc_helper().rpc_timeout())
|
||||
{
|
||||
error!("Block {:?} could not be put in resync queue: {}.", hash, e);
|
||||
}
|
||||
|
@ -565,7 +560,7 @@ impl BlockManager {
|
|||
None => {
|
||||
// Not found but maybe we should have had it ??
|
||||
self.resync
|
||||
.put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?;
|
||||
.put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?;
|
||||
return Err(Error::Message(format!(
|
||||
"block {:?} not found on node",
|
||||
hash
|
||||
|
|
|
@ -5,7 +5,6 @@ use tokio::sync::Semaphore;
|
|||
use opentelemetry::{global, metrics::*};
|
||||
|
||||
use garage_db as db;
|
||||
use garage_db::counted_tree_hack::CountedTree;
|
||||
|
||||
/// TableMetrics reference all counter used for metrics
|
||||
pub struct BlockManagerMetrics {
|
||||
|
@ -34,8 +33,8 @@ impl BlockManagerMetrics {
|
|||
pub fn new(
|
||||
compression_level: Option<i32>,
|
||||
rc_tree: db::Tree,
|
||||
resync_queue: CountedTree,
|
||||
resync_errors: CountedTree,
|
||||
resync_queue: db::Tree,
|
||||
resync_errors: db::Tree,
|
||||
buffer_semaphore: Arc<Semaphore>,
|
||||
) -> Self {
|
||||
let meter = global::meter("garage_model/block");
|
||||
|
@ -51,15 +50,17 @@ impl BlockManagerMetrics {
|
|||
.init(),
|
||||
_rc_size: meter
|
||||
.u64_value_observer("block.rc_size", move |observer| {
|
||||
if let Ok(Some(v)) = rc_tree.fast_len() {
|
||||
observer.observe(v as u64, &[])
|
||||
if let Ok(value) = rc_tree.len() {
|
||||
observer.observe(value as u64, &[])
|
||||
}
|
||||
})
|
||||
.with_description("Number of blocks known to the reference counter")
|
||||
.init(),
|
||||
_resync_queue_len: meter
|
||||
.u64_value_observer("block.resync_queue_length", move |observer| {
|
||||
observer.observe(resync_queue.len() as u64, &[])
|
||||
if let Ok(value) = resync_queue.len() {
|
||||
observer.observe(value as u64, &[]);
|
||||
}
|
||||
})
|
||||
.with_description(
|
||||
"Number of block hashes queued for local check and possible resync",
|
||||
|
@ -67,7 +68,9 @@ impl BlockManagerMetrics {
|
|||
.init(),
|
||||
_resync_errored_blocks: meter
|
||||
.u64_value_observer("block.resync_errored_blocks", move |observer| {
|
||||
observer.observe(resync_errors.len() as u64, &[])
|
||||
if let Ok(value) = resync_errors.len() {
|
||||
observer.observe(value as u64, &[]);
|
||||
}
|
||||
})
|
||||
.with_description("Number of block hashes whose last resync resulted in an error")
|
||||
.init(),
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
use std::convert::TryInto;
|
||||
|
||||
use arc_swap::ArcSwapOption;
|
||||
|
||||
use garage_db as db;
|
||||
|
||||
use garage_util::data::*;
|
||||
|
@ -8,13 +10,20 @@ use garage_util::time::*;
|
|||
|
||||
use crate::manager::BLOCK_GC_DELAY;
|
||||
|
||||
/// Callback that, given a database transaction and a block hash, returns a number of
/// references to that block. `BlockRc::recalculate_rc` adds up the values returned by
/// every registered callback to obtain the recalculated reference count.
pub type CalculateRefcount =
|
||||
Box<dyn Fn(&db::Transaction, &Hash) -> db::TxResult<usize, Error> + Send + Sync>;
|
||||
|
||||
pub struct BlockRc {
|
||||
pub(crate) rc: db::Tree,
|
||||
pub rc_table: db::Tree,
|
||||
pub(crate) recalc_rc: ArcSwapOption<Vec<CalculateRefcount>>,
|
||||
}
|
||||
|
||||
impl BlockRc {
|
||||
pub(crate) fn new(rc: db::Tree) -> Self {
|
||||
Self { rc }
|
||||
Self {
|
||||
rc_table: rc,
|
||||
recalc_rc: ArcSwapOption::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Increment the reference counter associated to a hash.
|
||||
|
@ -24,9 +33,9 @@ impl BlockRc {
|
|||
tx: &mut db::Transaction,
|
||||
hash: &Hash,
|
||||
) -> db::TxOpResult<bool> {
|
||||
let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?);
|
||||
let old_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?);
|
||||
match old_rc.increment().serialize() {
|
||||
Some(x) => tx.insert(&self.rc, hash, x)?,
|
||||
Some(x) => tx.insert(&self.rc_table, hash, x)?,
|
||||
None => unreachable!(),
|
||||
};
|
||||
Ok(old_rc.is_zero())
|
||||
|
@ -39,28 +48,28 @@ impl BlockRc {
|
|||
tx: &mut db::Transaction,
|
||||
hash: &Hash,
|
||||
) -> db::TxOpResult<bool> {
|
||||
let new_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?).decrement();
|
||||
let new_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?).decrement();
|
||||
match new_rc.serialize() {
|
||||
Some(x) => tx.insert(&self.rc, hash, x)?,
|
||||
None => tx.remove(&self.rc, hash)?,
|
||||
Some(x) => tx.insert(&self.rc_table, hash, x)?,
|
||||
None => tx.remove(&self.rc_table, hash)?,
|
||||
};
|
||||
Ok(matches!(new_rc, RcEntry::Deletable { .. }))
|
||||
}
|
||||
|
||||
/// Read a block's reference count
|
||||
pub(crate) fn get_block_rc(&self, hash: &Hash) -> Result<RcEntry, Error> {
|
||||
Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?))
|
||||
Ok(RcEntry::parse_opt(self.rc_table.get(hash.as_ref())?))
|
||||
}
|
||||
|
||||
/// Delete an entry in the RC table if it is deletable and the
|
||||
/// deletion time has passed
|
||||
pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
|
||||
let now = now_msec();
|
||||
self.rc.db().transaction(|tx| {
|
||||
let rcval = RcEntry::parse_opt(tx.get(&self.rc, hash)?);
|
||||
self.rc_table.db().transaction(|tx| {
|
||||
let rcval = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?);
|
||||
match rcval {
|
||||
RcEntry::Deletable { at_time } if now > at_time => {
|
||||
tx.remove(&self.rc, hash)?;
|
||||
tx.remove(&self.rc_table, hash)?;
|
||||
}
|
||||
_ => (),
|
||||
};
|
||||
|
@ -68,6 +77,58 @@ impl BlockRc {
|
|||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Recalculate the reference counter of a block
|
||||
/// to fix potential inconsistencies
///
/// Inside a single database transaction, this sums the counts returned by
/// every function loaded from `self.recalc_rc` and compares the total with
/// the value of the stored entry (`RcEntry::as_u64`). If they differ, the
/// stored entry is overwritten: `RcEntry::Present` when the recalculated
/// count is positive, otherwise `RcEntry::Deletable` with a deletion time
/// of now + `BLOCK_GC_DELAY`.
///
/// Returns `(count, changed)`, where `count` is the recalculated count and
/// `changed` is `true` only if the stored entry was rewritten.
///
/// # Errors
///
/// Returns `Error::Message` if `self.recalc_rc` currently holds no
/// recalculation functions. A failure of the transaction is logged with
/// `error!` and then returned to the caller.
|
||||
pub fn recalculate_rc(&self, hash: &Hash) -> Result<(usize, bool), Error> {
|
||||
if let Some(recalc_fns) = self.recalc_rc.load().as_ref() {
|
||||
trace!("Repair block RC for {:?}", hash);
|
||||
let res = self
|
||||
.rc_table
|
||||
.db()
|
||||
.transaction(|tx| {
|
||||
// Total number of references reported by all recalculation functions.
|
||||
let mut cnt = 0;
|
||||
for f in recalc_fns.iter() {
|
||||
cnt += f(&tx, hash)?;
|
||||
}
|
||||
// Read the stored entry in the same transaction as the recount.
|
||||
let old_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?);
|
||||
trace!(
|
||||
"Block RC for {:?}: stored={}, calculated={}",
|
||||
hash,
|
||||
old_rc.as_u64(),
|
||||
cnt
|
||||
);
|
||||
if cnt as u64 != old_rc.as_u64() {
|
||||
warn!(
|
||||
"Fixing inconsistent block RC for {:?}: was {}, should be {}",
|
||||
hash,
|
||||
old_rc.as_u64(),
|
||||
cnt
|
||||
);
|
||||
let new_rc = if cnt > 0 {
|
||||
RcEntry::Present { count: cnt as u64 }
|
||||
} else {
|
||||
// No reference left: mark the block as deletable once the GC delay has passed.
|
||||
RcEntry::Deletable {
|
||||
at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
|
||||
}
|
||||
};
|
||||
// NOTE(review): `new_rc` is always Present or Deletable here; serialize()
// presumably returns None only for an absent entry, so this unwrap should not panic -- confirm.
|
||||
tx.insert(&self.rc_table, hash, new_rc.serialize().unwrap())?;
|
||||
Ok((cnt, true))
|
||||
} else {
|
||||
Ok((cnt, false))
|
||||
}
|
||||
})
|
||||
.map_err(Error::from);
|
||||
// Log the failure here; the error is still handed back to the caller below.
|
||||
if let Err(e) = &res {
|
||||
error!("Failed to fix RC for block {:?}: {}", hash, e);
|
||||
}
|
||||
res
|
||||
} else {
|
||||
Err(Error::Message(
|
||||
"Block RC recalculation is not available at this point".into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Describes the state of the reference counter for a block
|
||||
|
|
|
@ -107,7 +107,7 @@ impl Worker for RepairWorker {
|
|||
for entry in self
|
||||
.manager
|
||||
.rc
|
||||
.rc
|
||||
.rc_table
|
||||
.range::<&[u8], _>((start_bound, Bound::Unbounded))?
|
||||
{
|
||||
let (hash, _) = entry?;
|
||||
|
|
|
@ -15,7 +15,6 @@ use opentelemetry::{
|
|||
};
|
||||
|
||||
use garage_db as db;
|
||||
use garage_db::counted_tree_hack::CountedTree;
|
||||
|
||||
use garage_util::background::*;
|
||||
use garage_util::data::*;
|
||||
|
@ -47,9 +46,9 @@ pub(crate) const MAX_RESYNC_WORKERS: usize = 8;
|
|||
const INITIAL_RESYNC_TRANQUILITY: u32 = 2;
|
||||
|
||||
pub struct BlockResyncManager {
|
||||
pub(crate) queue: CountedTree,
|
||||
pub(crate) queue: db::Tree,
|
||||
pub(crate) notify: Arc<Notify>,
|
||||
pub(crate) errors: CountedTree,
|
||||
pub(crate) errors: db::Tree,
|
||||
|
||||
busy_set: BusySet,
|
||||
|
||||
|
@ -90,12 +89,10 @@ impl BlockResyncManager {
|
|||
let queue = db
|
||||
.open_tree("block_local_resync_queue")
|
||||
.expect("Unable to open block_local_resync_queue tree");
|
||||
let queue = CountedTree::new(queue).expect("Could not count block_local_resync_queue");
|
||||
|
||||
let errors = db
|
||||
.open_tree("block_local_resync_errors")
|
||||
.expect("Unable to open block_local_resync_errors tree");
|
||||
let errors = CountedTree::new(errors).expect("Could not count block_local_resync_errors");
|
||||
|
||||
let persister = PersisterShared::new(&system.metadata_dir, "resync_cfg");
|
||||
|
||||
|
@ -110,16 +107,12 @@ impl BlockResyncManager {
|
|||
|
||||
/// Get length of resync queue
|
||||
pub fn queue_len(&self) -> Result<usize, Error> {
|
||||
// This currently can't return an error because the CountedTree hack
|
||||
// doesn't error on .len(), but this will change when we remove the hack
|
||||
// (hopefully someday!)
|
||||
Ok(self.queue.len())
|
||||
Ok(self.queue.len()?)
|
||||
}
|
||||
|
||||
/// Get number of blocks that have an error
|
||||
pub fn errors_len(&self) -> Result<usize, Error> {
|
||||
// (see queue_len comment)
|
||||
Ok(self.errors.len())
|
||||
Ok(self.errors.len()?)
|
||||
}
|
||||
|
||||
/// Clear the error counter for a block and put it in queue immediately
|
||||
|
@ -180,7 +173,7 @@ impl BlockResyncManager {
|
|||
// deleted once the garbage collection delay has passed.
|
||||
//
|
||||
// Here are some explanations on how the resync queue works.
|
||||
// There are two Sled trees that are used to have information
|
||||
// There are two db trees that are used to have information
|
||||
// about the status of blocks that need to be resynchronized:
|
||||
//
|
||||
// - resync.queue: a tree that is ordered first by a timestamp
|
||||
|
@ -374,10 +367,17 @@ impl BlockResyncManager {
|
|||
}
|
||||
|
||||
if exists && rc.is_deletable() {
|
||||
if manager.rc.recalculate_rc(hash)?.0 > 0 {
|
||||
return Err(Error::Message(format!(
|
||||
"Refcount for block {:?} was inconsistent, retrying later",
|
||||
hash
|
||||
)));
|
||||
}
|
||||
|
||||
info!("Resync block {:?}: offloading and deleting", hash);
|
||||
let existing_path = existing_path.unwrap();
|
||||
|
||||
let mut who = manager.replication.write_nodes(hash);
|
||||
let mut who = manager.replication.storage_nodes(hash);
|
||||
if who.len() < manager.replication.write_quorum() {
|
||||
return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
|
||||
}
|
||||
|
@ -385,7 +385,7 @@ impl BlockResyncManager {
|
|||
|
||||
let who_needs_resps = manager
|
||||
.system
|
||||
.rpc
|
||||
.rpc_helper()
|
||||
.call_many(
|
||||
&manager.endpoint,
|
||||
&who,
|
||||
|
@ -431,10 +431,10 @@ impl BlockResyncManager {
|
|||
.with_stream_from_buffer(bytes);
|
||||
manager
|
||||
.system
|
||||
.rpc
|
||||
.rpc_helper()
|
||||
.try_call_many(
|
||||
&manager.endpoint,
|
||||
&need_nodes[..],
|
||||
&need_nodes,
|
||||
put_block_message,
|
||||
RequestStrategy::with_priority(PRIO_BACKGROUND | PRIO_SECONDARY)
|
||||
.with_quorum(need_nodes.len()),
|
||||
|
@ -462,7 +462,15 @@ impl BlockResyncManager {
|
|||
|
||||
let block_data = manager
|
||||
.rpc_get_raw_block(hash, PRIO_BACKGROUND | PRIO_SECONDARY, None)
|
||||
.await?;
|
||||
.await;
|
||||
if matches!(block_data, Err(Error::MissingBlock(_))) {
|
||||
warn!(
|
||||
"Could not fetch needed block {:?}, no node returned valid data. Checking that refcount is correct.",
|
||||
hash
|
||||
);
|
||||
manager.rc.recalculate_rc(hash)?;
|
||||
}
|
||||
let block_data = block_data?;
|
||||
|
||||
manager.metrics.resync_recv_counter.add(1);
|
||||
|
||||
|
@ -543,9 +551,9 @@ impl Worker for ResyncWorker {
|
|||
Ok(WorkerState::Idle)
|
||||
}
|
||||
Err(e) => {
|
||||
// The errors that we have here are only Sled errors
|
||||
// The errors that we have here are only db errors
|
||||
// We don't really know how to handle them so just ¯\_(ツ)_/¯
|
||||
// (there is kind of an assumption that Sled won't error on us,
|
||||
// (there is kind of an assumption that the db won't error on us,
|
||||
// if it does there is not much we can do -- TODO should we just panic?)
|
||||
// Here we just give the error to the worker manager,
|
||||
// it will print it to the logs and increment a counter
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "garage_db"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
authors = ["Alex Auvolat <alex@adnab.me>"]
|
||||
edition = "2018"
|
||||
license = "AGPL-3.0"
|
||||
|
@ -20,13 +20,12 @@ heed = { workspace = true, optional = true }
|
|||
rusqlite = { workspace = true, optional = true, features = ["backup"] }
|
||||
r2d2 = { workspace = true, optional = true }
|
||||
r2d2_sqlite = { workspace = true, optional = true }
|
||||
sled = { workspace = true, optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
mktemp.workspace = true
|
||||
|
||||
[features]
|
||||
default = [ "sled", "lmdb", "sqlite" ]
|
||||
default = [ "lmdb", "sqlite" ]
|
||||
bundled-libs = [ "rusqlite?/bundled" ]
|
||||
lmdb = [ "heed" ]
|
||||
sqlite = [ "rusqlite", "r2d2", "r2d2_sqlite" ]
|
||||
|
|
|
@ -1,127 +0,0 @@
|
|||
//! This hack allows a db tree to keep in RAM a counter of the number of entries
|
||||
//! it contains, which is used to call .len() on it. This is useful only for
|
||||
//! the sled backend where .len() otherwise would have to traverse the whole
|
||||
//! tree to count items. For sqlite and lmdb, this is mostly useless (but
|
||||
//! hopefully not harmful!). Note that a CountedTree cannot be part of a
|
||||
//! transaction.
|
||||
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc,
|
||||
};
|
||||
|
||||
use crate::{Result, Tree, TxError, Value, ValueIter};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct CountedTree(Arc<CountedTreeInternal>);
|
||||
|
||||
struct CountedTreeInternal {
|
||||
tree: Tree,
|
||||
len: AtomicUsize,
|
||||
}
|
||||
|
||||
impl CountedTree {
|
||||
pub fn new(tree: Tree) -> Result<Self> {
|
||||
let len = tree.len()?;
|
||||
Ok(Self(Arc::new(CountedTreeInternal {
|
||||
tree,
|
||||
len: AtomicUsize::new(len),
|
||||
})))
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.0.len.load(Ordering::SeqCst)
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
|
||||
self.0.tree.get(key)
|
||||
}
|
||||
|
||||
pub fn first(&self) -> Result<Option<(Value, Value)>> {
|
||||
self.0.tree.first()
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Result<ValueIter<'_>> {
|
||||
self.0.tree.iter()
|
||||
}
|
||||
|
||||
// ---- writing functions ----
|
||||
|
||||
pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<Value>>
|
||||
where
|
||||
K: AsRef<[u8]>,
|
||||
V: AsRef<[u8]>,
|
||||
{
|
||||
let old_val = self.0.tree.insert(key, value)?;
|
||||
if old_val.is_none() {
|
||||
self.0.len.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
Ok(old_val)
|
||||
}
|
||||
|
||||
pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
|
||||
let old_val = self.0.tree.remove(key)?;
|
||||
if old_val.is_some() {
|
||||
self.0.len.fetch_sub(1, Ordering::SeqCst);
|
||||
}
|
||||
Ok(old_val)
|
||||
}
|
||||
|
||||
pub fn compare_and_swap<K, OV, NV>(
|
||||
&self,
|
||||
key: K,
|
||||
expected_old: Option<OV>,
|
||||
new: Option<NV>,
|
||||
) -> Result<bool>
|
||||
where
|
||||
K: AsRef<[u8]>,
|
||||
OV: AsRef<[u8]>,
|
||||
NV: AsRef<[u8]>,
|
||||
{
|
||||
let old_some = expected_old.is_some();
|
||||
let new_some = new.is_some();
|
||||
|
||||
let tx_res = self.0.tree.db().transaction(|tx| {
|
||||
let old_val = tx.get(&self.0.tree, &key)?;
|
||||
let is_same = match (&old_val, &expected_old) {
|
||||
(None, None) => true,
|
||||
(Some(x), Some(y)) if x == y.as_ref() => true,
|
||||
_ => false,
|
||||
};
|
||||
if is_same {
|
||||
match &new {
|
||||
Some(v) => {
|
||||
tx.insert(&self.0.tree, &key, v)?;
|
||||
}
|
||||
None => {
|
||||
tx.remove(&self.0.tree, &key)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
Err(TxError::Abort(()))
|
||||
}
|
||||
});
|
||||
|
||||
match tx_res {
|
||||
Ok(()) => {
|
||||
match (old_some, new_some) {
|
||||
(false, true) => {
|
||||
self.0.len.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
(true, false) => {
|
||||
self.0.len.fetch_sub(1, Ordering::SeqCst);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
Err(TxError::Abort(())) => Ok(false),
|
||||
Err(TxError::Db(e)) => Err(e),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3,13 +3,9 @@ extern crate tracing;
|
|||
|
||||
#[cfg(feature = "lmdb")]
|
||||
pub mod lmdb_adapter;
|
||||
#[cfg(feature = "sled")]
|
||||
pub mod sled_adapter;
|
||||
#[cfg(feature = "sqlite")]
|
||||
pub mod sqlite_adapter;
|
||||
|
||||
pub mod counted_tree_hack;
|
||||
|
||||
pub mod open;
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -62,6 +58,7 @@ pub type Result<T> = std::result::Result<T, Error>;
|
|||
pub struct TxOpError(pub(crate) Error);
|
||||
pub type TxOpResult<T> = std::result::Result<T, TxOpError>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TxError<E> {
|
||||
Abort(E),
|
||||
Db(Error),
|
||||
|
@ -200,10 +197,6 @@ impl Tree {
|
|||
pub fn len(&self) -> Result<usize> {
|
||||
self.0.len(self.1)
|
||||
}
|
||||
#[inline]
|
||||
pub fn fast_len(&self) -> Result<Option<usize>> {
|
||||
self.0.fast_len(self.1)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn first(&self) -> Result<Option<(Value, Value)>> {
|
||||
|
@ -293,6 +286,11 @@ impl<'a> Transaction<'a> {
|
|||
pub fn remove<T: AsRef<[u8]>>(&mut self, tree: &Tree, key: T) -> TxOpResult<Option<Value>> {
|
||||
self.tx.remove(tree.1, key.as_ref())
|
||||
}
|
||||
/// Clears all values in a tree
|
||||
#[inline]
|
||||
pub fn clear(&mut self, tree: &Tree) -> TxOpResult<()> {
|
||||
self.tx.clear(tree.1)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn iter(&self, tree: &Tree) -> TxOpResult<TxValueIter<'_>> {
|
||||
|
@ -340,9 +338,6 @@ pub(crate) trait IDb: Send + Sync {
|
|||
|
||||
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
|
||||
fn len(&self, tree: usize) -> Result<usize>;
|
||||
fn fast_len(&self, _tree: usize) -> Result<Option<usize>> {
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>>;
|
||||
fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
|
||||
|
@ -373,6 +368,7 @@ pub(crate) trait ITx {
|
|||
|
||||
fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>>;
|
||||
fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>>;
|
||||
fn clear(&mut self, tree: usize) -> TxOpResult<()>;
|
||||
|
||||
fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;
|
||||
fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;
|
||||
|
|
|
@ -4,6 +4,7 @@ use core::ptr::NonNull;
|
|||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::path::PathBuf;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use heed::types::ByteSlice;
|
||||
|
@ -131,10 +132,6 @@ impl IDb for LmdbDb {
|
|||
Ok(tree.len(&tx)?.try_into().unwrap())
|
||||
}
|
||||
|
||||
fn fast_len(&self, tree: usize) -> Result<Option<usize>> {
|
||||
Ok(Some(self.len(tree)?))
|
||||
}
|
||||
|
||||
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let mut tx = self.db.write_txn()?;
|
||||
|
@ -252,8 +249,9 @@ impl<'a> ITx for LmdbTx<'a> {
|
|||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
fn len(&self, _tree: usize) -> TxOpResult<usize> {
|
||||
unimplemented!(".len() in transaction not supported with LMDB backend")
|
||||
fn len(&self, tree: usize) -> TxOpResult<usize> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
Ok(tree.len(&self.tx)? as usize)
|
||||
}
|
||||
|
||||
fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
|
||||
|
@ -268,33 +266,48 @@ impl<'a> ITx for LmdbTx<'a> {
|
|||
tree.delete(&mut self.tx, key)?;
|
||||
Ok(old_val)
|
||||
}
|
||||
|
||||
fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with LMDB backend");
|
||||
fn clear(&mut self, tree: usize) -> TxOpResult<()> {
|
||||
let tree = *self.get_tree(tree)?;
|
||||
tree.clear(&mut self.tx)?;
|
||||
Ok(())
|
||||
}
|
||||
fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with LMDB backend");
|
||||
|
||||
fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = *self.get_tree(tree)?;
|
||||
Ok(Box::new(tree.iter(&self.tx)?.map(tx_iter_item)))
|
||||
}
|
||||
fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = *self.get_tree(tree)?;
|
||||
Ok(Box::new(tree.rev_iter(&self.tx)?.map(tx_iter_item)))
|
||||
}
|
||||
|
||||
fn range<'r>(
|
||||
&self,
|
||||
_tree: usize,
|
||||
_low: Bound<&'r [u8]>,
|
||||
_high: Bound<&'r [u8]>,
|
||||
tree: usize,
|
||||
low: Bound<&'r [u8]>,
|
||||
high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with LMDB backend");
|
||||
let tree = *self.get_tree(tree)?;
|
||||
Ok(Box::new(
|
||||
tree.range(&self.tx, &(low, high))?.map(tx_iter_item),
|
||||
))
|
||||
}
|
||||
fn range_rev<'r>(
|
||||
&self,
|
||||
_tree: usize,
|
||||
_low: Bound<&'r [u8]>,
|
||||
_high: Bound<&'r [u8]>,
|
||||
tree: usize,
|
||||
low: Bound<&'r [u8]>,
|
||||
high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with LMDB backend");
|
||||
let tree = *self.get_tree(tree)?;
|
||||
Ok(Box::new(
|
||||
tree.rev_range(&self.tx, &(low, high))?.map(tx_iter_item),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
// ----
|
||||
// ---- iterators outside transactions ----
|
||||
// complicated, they must hold the transaction object
|
||||
// therefore a bit of unsafe code (it is a self-referential struct)
|
||||
|
||||
type IteratorItem<'a> = heed::Result<(
|
||||
<ByteSlice as BytesDecode<'a>>::DItem,
|
||||
|
@ -317,12 +330,20 @@ where
|
|||
where
|
||||
F: FnOnce(&'a RoTxn<'a>) -> Result<I>,
|
||||
{
|
||||
let mut res = TxAndIterator { tx, iter: None };
|
||||
let res = TxAndIterator { tx, iter: None };
|
||||
let mut boxed = Box::pin(res);
|
||||
|
||||
let tx = unsafe { NonNull::from(&res.tx).as_ref() };
|
||||
res.iter = Some(iterfun(tx)?);
|
||||
// This unsafe allows us to bypass lifetime checks
|
||||
let tx = unsafe { NonNull::from(&boxed.tx).as_ref() };
|
||||
let iter = iterfun(tx)?;
|
||||
|
||||
Ok(Box::new(res))
|
||||
let mut_ref = Pin::as_mut(&mut boxed);
|
||||
// This unsafe allows us to write in a field of the pinned struct
|
||||
unsafe {
|
||||
Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
|
||||
}
|
||||
|
||||
Ok(Box::new(TxAndIteratorPin(boxed)))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -331,18 +352,26 @@ where
|
|||
I: Iterator<Item = IteratorItem<'a>> + 'a,
|
||||
{
|
||||
fn drop(&mut self) {
|
||||
// ensure the iterator is dropped before the RoTxn it references
|
||||
drop(self.iter.take());
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for TxAndIterator<'a, I>
|
||||
struct TxAndIteratorPin<'a, I>(Pin<Box<TxAndIterator<'a, I>>>)
|
||||
where
|
||||
I: Iterator<Item = IteratorItem<'a>> + 'a;
|
||||
|
||||
impl<'a, I> Iterator for TxAndIteratorPin<'a, I>
|
||||
where
|
||||
I: Iterator<Item = IteratorItem<'a>> + 'a,
|
||||
{
|
||||
type Item = Result<(Value, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.as_mut().unwrap().next() {
|
||||
let mut_ref = Pin::as_mut(&mut self.0);
|
||||
// This unsafe allows us to mutably access the iterator field
|
||||
let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() };
|
||||
match next {
|
||||
None => None,
|
||||
Some(Err(e)) => Some(Err(e.into())),
|
||||
Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))),
|
||||
|
@ -350,7 +379,16 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
// ----
|
||||
// ---- iterators within transactions ----
|
||||
|
||||
/// Converts one item yielded by a heed iterator into the form returned by
/// in-transaction iterators: on success the borrowed key and value slices
/// are copied into owned `Vec<u8>`s, and a `heed::Error` is wrapped into a
/// `TxOpError`.
fn tx_iter_item<'a>(
|
||||
item: std::result::Result<(&'a [u8], &'a [u8]), heed::Error>,
|
||||
) -> TxOpResult<(Vec<u8>, Vec<u8>)> {
|
||||
item.map(|(k, v)| (k.to_vec(), v.to_vec()))
|
||||
.map_err(|e| TxOpError(Error::from(e)))
|
||||
}
|
||||
|
||||
// ---- utility ----
|
||||
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
pub fn recommended_map_size() -> usize {
|
||||
|
|
|
@ -11,7 +11,6 @@ use crate::{Db, Error, Result};
|
|||
pub enum Engine {
|
||||
Lmdb,
|
||||
Sqlite,
|
||||
Sled,
|
||||
}
|
||||
|
||||
impl Engine {
|
||||
|
@ -20,7 +19,6 @@ impl Engine {
|
|||
match self {
|
||||
Self::Lmdb => "lmdb",
|
||||
Self::Sqlite => "sqlite",
|
||||
Self::Sled => "sled",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -38,10 +36,10 @@ impl std::str::FromStr for Engine {
|
|||
match text {
|
||||
"lmdb" | "heed" => Ok(Self::Lmdb),
|
||||
"sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite),
|
||||
"sled" => Ok(Self::Sled),
|
||||
"sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.4).".into())),
|
||||
kind => Err(Error(
|
||||
format!(
|
||||
"Invalid DB engine: {} (options are: lmdb, sled, sqlite)",
|
||||
"Invalid DB engine: {} (options are: lmdb, sqlite)",
|
||||
kind
|
||||
)
|
||||
.into(),
|
||||
|
@ -53,8 +51,6 @@ impl std::str::FromStr for Engine {
|
|||
pub struct OpenOpt {
|
||||
pub fsync: bool,
|
||||
pub lmdb_map_size: Option<usize>,
|
||||
pub sled_cache_capacity: usize,
|
||||
pub sled_flush_every_ms: u64,
|
||||
}
|
||||
|
||||
impl Default for OpenOpt {
|
||||
|
@ -62,31 +58,12 @@ impl Default for OpenOpt {
|
|||
Self {
|
||||
fsync: false,
|
||||
lmdb_map_size: None,
|
||||
sled_cache_capacity: 1024 * 1024 * 1024,
|
||||
sled_flush_every_ms: 2000,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result<Db> {
|
||||
match engine {
|
||||
// ---- Sled DB ----
|
||||
#[cfg(feature = "sled")]
|
||||
Engine::Sled => {
|
||||
if opt.fsync {
|
||||
return Err(Error(
|
||||
"`metadata_fsync = true` is not supported with the Sled database engine".into(),
|
||||
));
|
||||
}
|
||||
info!("Opening Sled database at: {}", path.display());
|
||||
let db = crate::sled_adapter::sled::Config::default()
|
||||
.path(&path)
|
||||
.cache_capacity(opt.sled_cache_capacity as u64)
|
||||
.flush_every_ms(Some(opt.sled_flush_every_ms))
|
||||
.open()?;
|
||||
Ok(crate::sled_adapter::SledDb::init(db))
|
||||
}
|
||||
|
||||
// ---- Sqlite DB ----
|
||||
#[cfg(feature = "sqlite")]
|
||||
Engine::Sqlite => {
|
||||
|
|
|
@ -1,282 +0,0 @@
|
|||
use core::ops::Bound;
|
||||
|
||||
use std::cell::Cell;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use sled::transaction::{
|
||||
ConflictableTransactionError, TransactionError, Transactional, TransactionalTree,
|
||||
UnabortableTransactionError,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult,
|
||||
TxResult, TxValueIter, Value, ValueIter,
|
||||
};
|
||||
|
||||
pub use sled;
|
||||
|
||||
// -- err
|
||||
|
||||
impl From<sled::Error> for Error {
|
||||
fn from(e: sled::Error) -> Error {
|
||||
Error(format!("Sled: {}", e).into())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<sled::Error> for TxOpError {
|
||||
fn from(e: sled::Error) -> TxOpError {
|
||||
TxOpError(e.into())
|
||||
}
|
||||
}
|
||||
|
||||
// -- db
|
||||
|
||||
pub struct SledDb {
|
||||
db: sled::Db,
|
||||
trees: RwLock<(Vec<sled::Tree>, HashMap<String, usize>)>,
|
||||
}
|
||||
|
||||
impl SledDb {
|
||||
#[deprecated(
|
||||
since = "0.9.0",
|
||||
note = "The Sled database is now deprecated and will be removed in Garage v1.0. Please migrate to LMDB or Sqlite as soon as possible."
|
||||
)]
|
||||
pub fn init(db: sled::Db) -> Db {
|
||||
tracing::warn!("-------------------- IMPORTANT WARNING !!! ----------------------");
|
||||
tracing::warn!("The Sled database is now deprecated and will be removed in Garage v1.0.");
|
||||
tracing::warn!("Please migrate to LMDB or Sqlite as soon as possible.");
|
||||
tracing::warn!("-----------------------------------------------------------------------");
|
||||
let s = Self {
|
||||
db,
|
||||
trees: RwLock::new((Vec::new(), HashMap::new())),
|
||||
};
|
||||
Db(Arc::new(s))
|
||||
}
|
||||
|
||||
fn get_tree(&self, i: usize) -> Result<sled::Tree> {
|
||||
self.trees
|
||||
.read()
|
||||
.unwrap()
|
||||
.0
|
||||
.get(i)
|
||||
.cloned()
|
||||
.ok_or_else(|| Error("invalid tree id".into()))
|
||||
}
|
||||
}
|
||||
|
||||
impl IDb for SledDb {
|
||||
fn engine(&self) -> String {
|
||||
"Sled".into()
|
||||
}
|
||||
|
||||
fn open_tree(&self, name: &str) -> Result<usize> {
|
||||
let mut trees = self.trees.write().unwrap();
|
||||
if let Some(i) = trees.1.get(name) {
|
||||
Ok(*i)
|
||||
} else {
|
||||
let tree = self.db.open_tree(name)?;
|
||||
let i = trees.0.len();
|
||||
trees.0.push(tree);
|
||||
trees.1.insert(name.to_string(), i);
|
||||
Ok(i)
|
||||
}
|
||||
}
|
||||
|
||||
fn list_trees(&self) -> Result<Vec<String>> {
|
||||
let mut trees = vec![];
|
||||
for name in self.db.tree_names() {
|
||||
let name = std::str::from_utf8(&name)
|
||||
.map_err(|e| Error(format!("{}", e).into()))?
|
||||
.to_string();
|
||||
if name != "__sled__default" {
|
||||
trees.push(name);
|
||||
}
|
||||
}
|
||||
Ok(trees)
|
||||
}
|
||||
|
||||
fn snapshot(&self, to: &PathBuf) -> Result<()> {
|
||||
let to_db = sled::open(to)?;
|
||||
let export = self.db.export();
|
||||
to_db.import(export);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ----
|
||||
|
||||
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let val = tree.get(key)?;
|
||||
Ok(val.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn len(&self, tree: usize) -> Result<usize> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
Ok(tree.len())
|
||||
}
|
||||
|
||||
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let old_val = tree.insert(key, value)?;
|
||||
Ok(old_val.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let old_val = tree.remove(key)?;
|
||||
Ok(old_val.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn clear(&self, tree: usize) -> Result<()> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
tree.clear()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
Ok(Box::new(tree.iter().map(|v| {
|
||||
v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
|
||||
})))
|
||||
}
|
||||
|
||||
fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
Ok(Box::new(tree.iter().rev().map(|v| {
|
||||
v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
|
||||
})))
|
||||
}
|
||||
|
||||
fn range<'r>(
|
||||
&self,
|
||||
tree: usize,
|
||||
low: Bound<&'r [u8]>,
|
||||
high: Bound<&'r [u8]>,
|
||||
) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).map(|v| {
|
||||
v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
|
||||
})))
|
||||
}
|
||||
fn range_rev<'r>(
|
||||
&self,
|
||||
tree: usize,
|
||||
low: Bound<&'r [u8]>,
|
||||
high: Bound<&'r [u8]>,
|
||||
) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).rev().map(
|
||||
|v| v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into),
|
||||
)))
|
||||
}
|
||||
|
||||
// ----
|
||||
|
||||
fn transaction(&self, f: &dyn ITxFn) -> TxResult<OnCommit, ()> {
|
||||
let trees = self.trees.read().unwrap();
|
||||
let res = trees.0.transaction(|txtrees| {
|
||||
let mut tx = SledTx {
|
||||
trees: txtrees,
|
||||
err: Cell::new(None),
|
||||
};
|
||||
match f.try_on(&mut tx) {
|
||||
TxFnResult::Ok(on_commit) => {
|
||||
assert!(tx.err.into_inner().is_none());
|
||||
Ok(on_commit)
|
||||
}
|
||||
TxFnResult::Abort => {
|
||||
assert!(tx.err.into_inner().is_none());
|
||||
Err(ConflictableTransactionError::Abort(()))
|
||||
}
|
||||
TxFnResult::DbErr => {
|
||||
let e = tx.err.into_inner().expect("No DB error");
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
});
|
||||
match res {
|
||||
Ok(on_commit) => Ok(on_commit),
|
||||
Err(TransactionError::Abort(())) => Err(TxError::Abort(())),
|
||||
Err(TransactionError::Storage(s)) => Err(TxError::Db(s.into())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ----
|
||||
|
||||
struct SledTx<'a> {
|
||||
trees: &'a [TransactionalTree],
|
||||
err: Cell<Option<UnabortableTransactionError>>,
|
||||
}
|
||||
|
||||
impl<'a> SledTx<'a> {
|
||||
fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalTree> {
|
||||
self.trees.get(i).ok_or_else(|| {
|
||||
TxOpError(Error(
|
||||
"invalid tree id (it might have been openned after the transaction started)".into(),
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
fn save_error<R>(
|
||||
&self,
|
||||
v: std::result::Result<R, UnabortableTransactionError>,
|
||||
) -> TxOpResult<R> {
|
||||
match v {
|
||||
Ok(x) => Ok(x),
|
||||
Err(e) => {
|
||||
let txt = format!("{}", e);
|
||||
self.err.set(Some(e));
|
||||
Err(TxOpError(Error(txt.into())))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> ITx for SledTx<'a> {
|
||||
fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let tmp = self.save_error(tree.get(key))?;
|
||||
Ok(tmp.map(|x| x.to_vec()))
|
||||
}
|
||||
fn len(&self, _tree: usize) -> TxOpResult<usize> {
|
||||
unimplemented!(".len() in transaction not supported with Sled backend")
|
||||
}
|
||||
|
||||
fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let old_val = self.save_error(tree.insert(key, value))?;
|
||||
Ok(old_val.map(|x| x.to_vec()))
|
||||
}
|
||||
fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let old_val = self.save_error(tree.remove(key))?;
|
||||
Ok(old_val.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with Sled backend");
|
||||
}
|
||||
fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with Sled backend");
|
||||
}
|
||||
|
||||
fn range<'r>(
|
||||
&self,
|
||||
_tree: usize,
|
||||
_low: Bound<&'r [u8]>,
|
||||
_high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with Sled backend");
|
||||
}
|
||||
fn range_rev<'r>(
|
||||
&self,
|
||||
_tree: usize,
|
||||
_low: Bound<&'r [u8]>,
|
||||
_high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!("Iterators in transactions not supported with Sled backend");
|
||||
}
|
||||
}
|
|
@ -169,10 +169,6 @@ impl IDb for SqliteDb {
|
|||
}
|
||||
}
|
||||
|
||||
fn fast_len(&self, tree: usize) -> Result<Option<usize>> {
|
||||
Ok(Some(self.len(tree)?))
|
||||
}
|
||||
|
||||
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let db = self.db.get()?;
|
||||
|
@ -371,33 +367,64 @@ impl<'a> ITx for SqliteTx<'a> {
|
|||
|
||||
Ok(old_val)
|
||||
}
|
||||
|
||||
fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!();
|
||||
fn clear(&mut self, tree: usize) -> TxOpResult<()> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
self.tx.execute(&format!("DELETE FROM {}", tree), [])?;
|
||||
Ok(())
|
||||
}
|
||||
fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!();
|
||||
|
||||
fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let sql = format!("SELECT k, v FROM {} ORDER BY k ASC", tree);
|
||||
TxValueIterator::make(self, &sql, [])
|
||||
}
|
||||
fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = self.get_tree(tree)?;
|
||||
let sql = format!("SELECT k, v FROM {} ORDER BY k DESC", tree);
|
||||
TxValueIterator::make(self, &sql, [])
|
||||
}
|
||||
|
||||
fn range<'r>(
|
||||
&self,
|
||||
_tree: usize,
|
||||
_low: Bound<&'r [u8]>,
|
||||
_high: Bound<&'r [u8]>,
|
||||
tree: usize,
|
||||
low: Bound<&'r [u8]>,
|
||||
high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!();
|
||||
let tree = self.get_tree(tree)?;
|
||||
|
||||
let (bounds_sql, params) = bounds_sql(low, high);
|
||||
let sql = format!("SELECT k, v FROM {} {} ORDER BY k ASC", tree, bounds_sql);
|
||||
|
||||
let params = params
|
||||
.iter()
|
||||
.map(|x| x as &dyn rusqlite::ToSql)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref())
|
||||
}
|
||||
fn range_rev<'r>(
|
||||
&self,
|
||||
_tree: usize,
|
||||
_low: Bound<&'r [u8]>,
|
||||
_high: Bound<&'r [u8]>,
|
||||
tree: usize,
|
||||
low: Bound<&'r [u8]>,
|
||||
high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
unimplemented!();
|
||||
let tree = self.get_tree(tree)?;
|
||||
|
||||
let (bounds_sql, params) = bounds_sql(low, high);
|
||||
let sql = format!("SELECT k, v FROM {} {} ORDER BY k DESC", tree, bounds_sql);
|
||||
|
||||
let params = params
|
||||
.iter()
|
||||
.map(|x| x as &dyn rusqlite::ToSql)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref())
|
||||
}
|
||||
}
|
||||
|
||||
// ----
|
||||
// ---- iterators outside transactions ----
|
||||
// complicated, they must hold the Statement and Row objects
|
||||
// therefore quite some unsafe code (it is a self-referential struct)
|
||||
|
||||
struct DbValueIterator<'a> {
|
||||
db: Connection,
|
||||
|
@ -417,17 +444,23 @@ impl<'a> DbValueIterator<'a> {
|
|||
let mut boxed = Box::pin(res);
|
||||
trace!("make iterator with sql: {}", sql);
|
||||
|
||||
// This unsafe allows us to bypass lifetime checks
|
||||
let db = unsafe { NonNull::from(&boxed.db).as_ref() };
|
||||
let stmt = db.prepare(sql)?;
|
||||
|
||||
let mut_ref = Pin::as_mut(&mut boxed);
|
||||
// This unsafe allows us to write in a field of the pinned struct
|
||||
unsafe {
|
||||
let db = NonNull::from(&boxed.db);
|
||||
let stmt = db.as_ref().prepare(sql)?;
|
||||
|
||||
let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed);
|
||||
Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt);
|
||||
}
|
||||
|
||||
let mut stmt = NonNull::from(&boxed.stmt);
|
||||
let iter = stmt.as_mut().as_mut().unwrap().query(args)?;
|
||||
// This unsafe allows us to bypass lifetime checks
|
||||
let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() };
|
||||
let iter = stmt.as_mut().unwrap().query(args)?;
|
||||
|
||||
let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed);
|
||||
let mut_ref = Pin::as_mut(&mut boxed);
|
||||
// This unsafe allows us to write in a field of the pinned struct
|
||||
unsafe {
|
||||
Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
|
||||
}
|
||||
|
||||
|
@ -449,28 +482,73 @@ impl<'a> Iterator for DbValueIteratorPin<'a> {
|
|||
type Item = Result<(Value, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let next = unsafe {
|
||||
let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut self.0);
|
||||
Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next()
|
||||
};
|
||||
let row = match next {
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
Ok(None) => return None,
|
||||
Ok(Some(r)) => r,
|
||||
};
|
||||
let k = match row.get::<_, Vec<u8>>(0) {
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
Ok(x) => x,
|
||||
};
|
||||
let v = match row.get::<_, Vec<u8>>(1) {
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
Ok(y) => y,
|
||||
};
|
||||
Some(Ok((k, v)))
|
||||
let mut_ref = Pin::as_mut(&mut self.0);
|
||||
// This unsafe allows us to mutably access the iterator field
|
||||
let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() };
|
||||
iter_next_row(next)
|
||||
}
|
||||
}
|
||||
|
||||
// ----
|
||||
// ---- iterators within transactions ----
|
||||
// it's the same except we don't hold a mutex guard,
|
||||
// only a Statement and a Rows object
|
||||
|
||||
struct TxValueIterator<'a> {
|
||||
stmt: Statement<'a>,
|
||||
iter: Option<Rows<'a>>,
|
||||
_pin: PhantomPinned,
|
||||
}
|
||||
|
||||
impl<'a> TxValueIterator<'a> {
|
||||
fn make<P: rusqlite::Params>(
|
||||
tx: &'a SqliteTx<'a>,
|
||||
sql: &str,
|
||||
args: P,
|
||||
) -> TxOpResult<TxValueIter<'a>> {
|
||||
let stmt = tx.tx.prepare(sql)?;
|
||||
let res = TxValueIterator {
|
||||
stmt,
|
||||
iter: None,
|
||||
_pin: PhantomPinned,
|
||||
};
|
||||
let mut boxed = Box::pin(res);
|
||||
trace!("make iterator with sql: {}", sql);
|
||||
|
||||
// This unsafe allows us to bypass lifetime checks
|
||||
let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() };
|
||||
let iter = stmt.query(args)?;
|
||||
|
||||
let mut_ref = Pin::as_mut(&mut boxed);
|
||||
// This unsafe allows us to write in a field of the pinned struct
|
||||
unsafe {
|
||||
Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
|
||||
}
|
||||
|
||||
Ok(Box::new(TxValueIteratorPin(boxed)))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for TxValueIterator<'a> {
|
||||
fn drop(&mut self) {
|
||||
trace!("drop iter");
|
||||
drop(self.iter.take());
|
||||
}
|
||||
}
|
||||
|
||||
struct TxValueIteratorPin<'a>(Pin<Box<TxValueIterator<'a>>>);
|
||||
|
||||
impl<'a> Iterator for TxValueIteratorPin<'a> {
|
||||
type Item = TxOpResult<(Value, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut_ref = Pin::as_mut(&mut self.0);
|
||||
// This unsafe allows us to mutably access the iterator field
|
||||
let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() };
|
||||
iter_next_row(next)
|
||||
}
|
||||
}
|
||||
|
||||
// ---- utility ----
|
||||
|
||||
fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec<Vec<u8>>) {
|
||||
let mut sql = String::new();
|
||||
|
@ -510,3 +588,25 @@ fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec<V
|
|||
|
||||
(sql, params)
|
||||
}
|
||||
|
||||
fn iter_next_row<E>(
|
||||
next_row: rusqlite::Result<Option<&rusqlite::Row>>,
|
||||
) -> Option<std::result::Result<(Value, Value), E>>
|
||||
where
|
||||
E: From<rusqlite::Error>,
|
||||
{
|
||||
let row = match next_row {
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
Ok(None) => return None,
|
||||
Ok(Some(r)) => r,
|
||||
};
|
||||
let k = match row.get::<_, Vec<u8>>(0) {
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
Ok(x) => x,
|
||||
};
|
||||
let v = match row.get::<_, Vec<u8>>(1) {
|
||||
Err(e) => return Some(Err(e.into())),
|
||||
Ok(y) => y,
|
||||
};
|
||||
Some(Ok((k, v)))
|
||||
}
|
||||
|
|
|
@ -10,8 +10,13 @@ fn test_suite(db: Db) {
|
|||
let vb: &[u8] = &b"plip"[..];
|
||||
let vc: &[u8] = &b"plup"[..];
|
||||
|
||||
// ---- test simple insert/delete ----
|
||||
|
||||
assert!(tree.insert(ka, va).unwrap().is_none());
|
||||
assert_eq!(tree.get(ka).unwrap().unwrap(), va);
|
||||
assert_eq!(tree.len().unwrap(), 1);
|
||||
|
||||
// ---- test transaction logic ----
|
||||
|
||||
let res = db.transaction::<_, (), _>(|tx| {
|
||||
assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va);
|
||||
|
@ -37,6 +42,8 @@ fn test_suite(db: Db) {
|
|||
assert!(matches!(res, Err(TxError::Abort(42))));
|
||||
assert_eq!(tree.get(ka).unwrap().unwrap(), vb);
|
||||
|
||||
// ---- test iteration outside of transactions ----
|
||||
|
||||
let mut iter = tree.iter().unwrap();
|
||||
let next = iter.next().unwrap().unwrap();
|
||||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
|
||||
|
@ -73,6 +80,48 @@ fn test_suite(db: Db) {
|
|||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
|
||||
assert!(iter.next().is_none());
|
||||
drop(iter);
|
||||
|
||||
// ---- test iteration within transactions ----
|
||||
|
||||
db.transaction::<_, (), _>(|tx| {
|
||||
let mut iter = tx.iter(&tree).unwrap();
|
||||
let next = iter.next().unwrap().unwrap();
|
||||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
|
||||
let next = iter.next().unwrap().unwrap();
|
||||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
|
||||
assert!(iter.next().is_none());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
db.transaction::<_, (), _>(|tx| {
|
||||
let mut iter = tx.range(&tree, kint..).unwrap();
|
||||
let next = iter.next().unwrap().unwrap();
|
||||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
|
||||
assert!(iter.next().is_none());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
db.transaction::<_, (), _>(|tx| {
|
||||
let mut iter = tx.range_rev(&tree, ..kint).unwrap();
|
||||
let next = iter.next().unwrap().unwrap();
|
||||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
|
||||
assert!(iter.next().is_none());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
db.transaction::<_, (), _>(|tx| {
|
||||
let mut iter = tx.iter_rev(&tree).unwrap();
|
||||
let next = iter.next().unwrap().unwrap();
|
||||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
|
||||
let next = iter.next().unwrap().unwrap();
|
||||
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
|
||||
assert!(iter.next().is_none());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -90,17 +139,6 @@ fn test_lmdb_db() {
|
|||
drop(path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "sled")]
|
||||
fn test_sled_db() {
|
||||
use crate::sled_adapter::SledDb;
|
||||
|
||||
let path = mktemp::Temp::new_dir().unwrap();
|
||||
let db = SledDb::init(sled::open(path.to_path_buf()).unwrap());
|
||||
test_suite(db);
|
||||
drop(path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "sqlite")]
|
||||
fn test_sqlite_db() {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "garage"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
authors = ["Alex Auvolat <alex@adnab.me>"]
|
||||
edition = "2018"
|
||||
license = "AGPL-3.0"
|
||||
|
@ -42,6 +42,7 @@ tracing.workspace = true
|
|||
tracing-subscriber.workspace = true
|
||||
rand.workspace = true
|
||||
async-trait.workspace = true
|
||||
sha1.workspace = true
|
||||
sodiumoxide.workspace = true
|
||||
structopt.workspace = true
|
||||
git-version.workspace = true
|
||||
|
@ -81,12 +82,11 @@ k2v-client.workspace = true
|
|||
|
||||
|
||||
[features]
|
||||
default = [ "bundled-libs", "metrics", "sled", "lmdb", "sqlite", "k2v" ]
|
||||
default = [ "bundled-libs", "metrics", "lmdb", "sqlite", "k2v" ]
|
||||
|
||||
k2v = [ "garage_util/k2v", "garage_api/k2v" ]
|
||||
|
||||
# Database engines, Sled is still our default even though we don't like it
|
||||
sled = [ "garage_model/sled" ]
|
||||
# Database engines
|
||||
lmdb = [ "garage_model/lmdb" ]
|
||||
sqlite = [ "garage_model/sqlite" ]
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ impl AdminRpcHandler {
|
|||
.table
|
||||
.get(&bucket_id, &EmptyKey)
|
||||
.await?
|
||||
.map(|x| x.filtered_values(&self.garage.system.ring.borrow()))
|
||||
.map(|x| x.filtered_values(&self.garage.system.cluster_layout()))
|
||||
.unwrap_or_default();
|
||||
|
||||
let mpu_counters = self
|
||||
|
@ -78,7 +78,7 @@ impl AdminRpcHandler {
|
|||
.table
|
||||
.get(&bucket_id, &EmptyKey)
|
||||
.await?
|
||||
.map(|x| x.filtered_values(&self.garage.system.ring.borrow()))
|
||||
.map(|x| x.filtered_values(&self.garage.system.cluster_layout()))
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut relevant_keys = HashMap::new();
|
||||
|
|
|
@ -18,7 +18,7 @@ use garage_util::error::Error as GarageError;
|
|||
use garage_table::replication::*;
|
||||
use garage_table::*;
|
||||
|
||||
use garage_rpc::ring::PARTITION_BITS;
|
||||
use garage_rpc::layout::PARTITION_BITS;
|
||||
use garage_rpc::*;
|
||||
|
||||
use garage_block::manager::BlockResyncErrorInfo;
|
||||
|
@ -27,7 +27,6 @@ use garage_model::bucket_table::*;
|
|||
use garage_model::garage::Garage;
|
||||
use garage_model::helper::error::{Error, OkOrBadRequest};
|
||||
use garage_model::key_table::*;
|
||||
use garage_model::migrate::Migrate;
|
||||
use garage_model::s3::mpu_table::MultipartUpload;
|
||||
use garage_model::s3::version_table::Version;
|
||||
|
||||
|
@ -42,7 +41,6 @@ pub enum AdminRpc {
|
|||
BucketOperation(BucketOperation),
|
||||
KeyOperation(KeyOperation),
|
||||
LaunchRepair(RepairOpt),
|
||||
Migrate(MigrateOpt),
|
||||
Stats(StatsOpt),
|
||||
Worker(WorkerOperation),
|
||||
BlockOperation(BlockOperation),
|
||||
|
@ -96,24 +94,6 @@ impl AdminRpcHandler {
|
|||
admin
|
||||
}
|
||||
|
||||
// ================ MIGRATION COMMANDS ====================
|
||||
|
||||
async fn handle_migrate(self: &Arc<Self>, opt: MigrateOpt) -> Result<AdminRpc, Error> {
|
||||
if !opt.yes {
|
||||
return Err(Error::BadRequest(
|
||||
"Please provide the --yes flag to initiate migration operation.".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let m = Migrate {
|
||||
garage: self.garage.clone(),
|
||||
};
|
||||
match opt.what {
|
||||
MigrateWhat::Buckets050 => m.migrate_buckets050().await,
|
||||
}?;
|
||||
Ok(AdminRpc::Ok("Migration successfull.".into()))
|
||||
}
|
||||
|
||||
// ================ REPAIR COMMANDS ====================
|
||||
|
||||
async fn handle_launch_repair(self: &Arc<Self>, opt: RepairOpt) -> Result<AdminRpc, Error> {
|
||||
|
@ -127,8 +107,8 @@ impl AdminRpcHandler {
|
|||
opt_to_send.all_nodes = false;
|
||||
|
||||
let mut failures = vec![];
|
||||
let ring = self.garage.system.ring.borrow().clone();
|
||||
for node in ring.layout.node_ids().iter() {
|
||||
let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
|
||||
for node in all_nodes.iter() {
|
||||
let node = (*node).into();
|
||||
let resp = self
|
||||
.endpoint
|
||||
|
@ -164,9 +144,9 @@ impl AdminRpcHandler {
|
|||
async fn handle_stats(&self, opt: StatsOpt) -> Result<AdminRpc, Error> {
|
||||
if opt.all_nodes {
|
||||
let mut ret = String::new();
|
||||
let ring = self.garage.system.ring.borrow().clone();
|
||||
let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
|
||||
|
||||
for node in ring.layout.node_ids().iter() {
|
||||
for node in all_nodes.iter() {
|
||||
let mut opt = opt.clone();
|
||||
opt.all_nodes = false;
|
||||
opt.skip_global = true;
|
||||
|
@ -218,11 +198,11 @@ impl AdminRpcHandler {
|
|||
|
||||
// Gather table statistics
|
||||
let mut table = vec![" Table\tItems\tMklItems\tMklTodo\tGcTodo".into()];
|
||||
table.push(self.gather_table_stats(&self.garage.bucket_table, opt.detailed)?);
|
||||
table.push(self.gather_table_stats(&self.garage.key_table, opt.detailed)?);
|
||||
table.push(self.gather_table_stats(&self.garage.object_table, opt.detailed)?);
|
||||
table.push(self.gather_table_stats(&self.garage.version_table, opt.detailed)?);
|
||||
table.push(self.gather_table_stats(&self.garage.block_ref_table, opt.detailed)?);
|
||||
table.push(self.gather_table_stats(&self.garage.bucket_table)?);
|
||||
table.push(self.gather_table_stats(&self.garage.key_table)?);
|
||||
table.push(self.gather_table_stats(&self.garage.object_table)?);
|
||||
table.push(self.gather_table_stats(&self.garage.version_table)?);
|
||||
table.push(self.gather_table_stats(&self.garage.block_ref_table)?);
|
||||
write!(
|
||||
&mut ret,
|
||||
"\nTable stats:\n{}",
|
||||
|
@ -232,15 +212,7 @@ impl AdminRpcHandler {
|
|||
|
||||
// Gather block manager statistics
|
||||
writeln!(&mut ret, "\nBlock manager stats:").unwrap();
|
||||
let rc_len = if opt.detailed {
|
||||
self.garage.block_manager.rc_len()?.to_string()
|
||||
} else {
|
||||
self.garage
|
||||
.block_manager
|
||||
.rc_fast_len()?
|
||||
.map(|x| x.to_string())
|
||||
.unwrap_or_else(|| "NC".into())
|
||||
};
|
||||
let rc_len = self.garage.block_manager.rc_len()?.to_string();
|
||||
|
||||
writeln!(
|
||||
&mut ret,
|
||||
|
@ -261,10 +233,6 @@ impl AdminRpcHandler {
|
|||
)
|
||||
.unwrap();
|
||||
|
||||
if !opt.detailed {
|
||||
writeln!(&mut ret, "\nIf values are missing above (marked as NC), consider adding the --detailed flag (this will be slow).").unwrap();
|
||||
}
|
||||
|
||||
if !opt.skip_global {
|
||||
write!(&mut ret, "\n{}", self.gather_cluster_stats()).unwrap();
|
||||
}
|
||||
|
@ -275,11 +243,11 @@ impl AdminRpcHandler {
|
|||
fn gather_cluster_stats(&self) -> String {
|
||||
let mut ret = String::new();
|
||||
|
||||
// Gather storage node and free space statistics
|
||||
let layout = &self.garage.system.ring.borrow().layout;
|
||||
// Gather storage node and free space statistics for current nodes
|
||||
let layout = &self.garage.system.cluster_layout();
|
||||
let mut node_partition_count = HashMap::<Uuid, u64>::new();
|
||||
for short_id in layout.ring_assignment_data.iter() {
|
||||
let id = layout.node_id_vec[*short_id as usize];
|
||||
for short_id in layout.current().ring_assignment_data.iter() {
|
||||
let id = layout.current().node_id_vec[*short_id as usize];
|
||||
*node_partition_count.entry(id).or_default() += 1;
|
||||
}
|
||||
let node_info = self
|
||||
|
@ -294,8 +262,8 @@ impl AdminRpcHandler {
|
|||
for (id, parts) in node_partition_count.iter() {
|
||||
let info = node_info.get(id);
|
||||
let status = info.map(|x| &x.status);
|
||||
let role = layout.roles.get(id).and_then(|x| x.0.as_ref());
|
||||
let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?");
|
||||
let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref());
|
||||
let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?");
|
||||
let zone = role.map(|x| x.zone.as_str()).unwrap_or("?");
|
||||
let capacity = role
|
||||
.map(|x| x.capacity_string())
|
||||
|
@ -366,34 +334,13 @@ impl AdminRpcHandler {
|
|||
ret
|
||||
}
|
||||
|
||||
fn gather_table_stats<F, R>(
|
||||
&self,
|
||||
t: &Arc<Table<F, R>>,
|
||||
detailed: bool,
|
||||
) -> Result<String, Error>
|
||||
fn gather_table_stats<F, R>(&self, t: &Arc<Table<F, R>>) -> Result<String, Error>
|
||||
where
|
||||
F: TableSchema + 'static,
|
||||
R: TableReplication + 'static,
|
||||
{
|
||||
let (data_len, mkl_len) = if detailed {
|
||||
(
|
||||
t.data.store.len().map_err(GarageError::from)?.to_string(),
|
||||
t.merkle_updater.merkle_tree_len()?.to_string(),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
t.data
|
||||
.store
|
||||
.fast_len()
|
||||
.map_err(GarageError::from)?
|
||||
.map(|x| x.to_string())
|
||||
.unwrap_or_else(|| "NC".into()),
|
||||
t.merkle_updater
|
||||
.merkle_tree_fast_len()?
|
||||
.map(|x| x.to_string())
|
||||
.unwrap_or_else(|| "NC".into()),
|
||||
)
|
||||
};
|
||||
let data_len = t.data.store.len().map_err(GarageError::from)?.to_string();
|
||||
let mkl_len = t.merkle_updater.merkle_tree_len()?.to_string();
|
||||
|
||||
Ok(format!(
|
||||
" {}\t{}\t{}\t{}\t{}",
|
||||
|
@ -441,8 +388,8 @@ impl AdminRpcHandler {
|
|||
) -> Result<AdminRpc, Error> {
|
||||
if all_nodes {
|
||||
let mut ret = vec![];
|
||||
let ring = self.garage.system.ring.borrow().clone();
|
||||
for node in ring.layout.node_ids().iter() {
|
||||
let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
|
||||
for node in all_nodes.iter() {
|
||||
let node = (*node).into();
|
||||
match self
|
||||
.endpoint
|
||||
|
@ -489,8 +436,8 @@ impl AdminRpcHandler {
|
|||
) -> Result<AdminRpc, Error> {
|
||||
if all_nodes {
|
||||
let mut ret = vec![];
|
||||
let ring = self.garage.system.ring.borrow().clone();
|
||||
for node in ring.layout.node_ids().iter() {
|
||||
let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
|
||||
for node in all_nodes.iter() {
|
||||
let node = (*node).into();
|
||||
match self
|
||||
.endpoint
|
||||
|
@ -525,8 +472,7 @@ impl AdminRpcHandler {
|
|||
async fn handle_meta_cmd(self: &Arc<Self>, mo: &MetaOperation) -> Result<AdminRpc, Error> {
|
||||
match mo {
|
||||
MetaOperation::Snapshot { all: true } => {
|
||||
let ring = self.garage.system.ring.borrow().clone();
|
||||
let to = ring.layout.node_ids().to_vec();
|
||||
let to = self.garage.system.cluster_layout().all_nodes().to_vec();
|
||||
|
||||
let resps = futures::future::join_all(to.iter().map(|to| async move {
|
||||
let to = (*to).into();
|
||||
|
@ -569,7 +515,6 @@ impl EndpointHandler<AdminRpc> for AdminRpcHandler {
|
|||
match message {
|
||||
AdminRpc::BucketOperation(bo) => self.handle_bucket_cmd(bo).await,
|
||||
AdminRpc::KeyOperation(ko) => self.handle_key_cmd(ko).await,
|
||||
AdminRpc::Migrate(opt) => self.handle_migrate(opt.clone()).await,
|
||||
AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await,
|
||||
AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await,
|
||||
AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await,
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::time::Duration;
|
||||
|
||||
use format_table::format_table;
|
||||
|
@ -33,9 +33,6 @@ pub async fn cli_command_dispatch(
|
|||
Command::Key(ko) => {
|
||||
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::KeyOperation(ko)).await
|
||||
}
|
||||
Command::Migrate(mo) => {
|
||||
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Migrate(mo)).await
|
||||
}
|
||||
Command::Repair(ro) => {
|
||||
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await
|
||||
}
|
||||
|
@ -52,21 +49,19 @@ pub async fn cli_command_dispatch(
|
|||
}
|
||||
|
||||
pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> {
|
||||
let status = match rpc_cli
|
||||
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
|
||||
.await??
|
||||
{
|
||||
SystemRpc::ReturnKnownNodes(nodes) => nodes,
|
||||
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
|
||||
};
|
||||
let status = fetch_status(rpc_cli, rpc_host).await?;
|
||||
let layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
|
||||
println!("==== HEALTHY NODES ====");
|
||||
let mut healthy_nodes =
|
||||
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()];
|
||||
for adv in status.iter().filter(|adv| adv.is_up) {
|
||||
match layout.roles.get(&adv.id) {
|
||||
Some(NodeRoleV(Some(cfg))) => {
|
||||
let host = adv.status.hostname.as_deref().unwrap_or("?");
|
||||
let addr = match adv.addr {
|
||||
Some(addr) => addr.to_string(),
|
||||
None => "N/A".to_string(),
|
||||
};
|
||||
if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) {
|
||||
let data_avail = match &adv.status.data_disk_avail {
|
||||
_ if cfg.capacity.is_none() => "N/A".into(),
|
||||
Some((avail, total)) => {
|
||||
|
@ -79,24 +74,41 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
|
|||
healthy_nodes.push(format!(
|
||||
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}",
|
||||
id = adv.id,
|
||||
host = adv.status.hostname,
|
||||
addr = adv.addr,
|
||||
host = host,
|
||||
addr = addr,
|
||||
tags = cfg.tags.join(","),
|
||||
zone = cfg.zone,
|
||||
capacity = cfg.capacity_string(),
|
||||
data_avail = data_avail,
|
||||
));
|
||||
}
|
||||
_ => {
|
||||
let new_role = match layout.staging_roles.get(&adv.id) {
|
||||
Some(NodeRoleV(Some(_))) => "(pending)",
|
||||
} else {
|
||||
let prev_role = layout
|
||||
.versions
|
||||
.iter()
|
||||
.rev()
|
||||
.find_map(|x| match x.roles.get(&adv.id) {
|
||||
Some(NodeRoleV(Some(cfg))) => Some(cfg),
|
||||
_ => None,
|
||||
});
|
||||
if let Some(cfg) = prev_role {
|
||||
healthy_nodes.push(format!(
|
||||
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...",
|
||||
id = adv.id,
|
||||
host = host,
|
||||
addr = addr,
|
||||
tags = cfg.tags.join(","),
|
||||
zone = cfg.zone,
|
||||
));
|
||||
} else {
|
||||
let new_role = match layout.staging.get().roles.get(&adv.id) {
|
||||
Some(NodeRoleV(Some(_))) => "pending...",
|
||||
_ => "NO ROLE ASSIGNED",
|
||||
};
|
||||
healthy_nodes.push(format!(
|
||||
"{id:?}\t{h}\t{addr}\t{new_role}",
|
||||
"{id:?}\t{h}\t{addr}\t\t\t{new_role}",
|
||||
id = adv.id,
|
||||
h = adv.status.hostname,
|
||||
addr = adv.addr,
|
||||
h = host,
|
||||
addr = addr,
|
||||
new_role = new_role,
|
||||
));
|
||||
}
|
||||
|
@ -104,51 +116,73 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
|
|||
}
|
||||
format_table(healthy_nodes);
|
||||
|
||||
let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>();
|
||||
let failure_case_1 = status
|
||||
// Determine which nodes are unhealthy and print that to stdout
|
||||
let status_map = status
|
||||
.iter()
|
||||
.any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_)))));
|
||||
let failure_case_2 = layout
|
||||
.roles
|
||||
.items()
|
||||
.iter()
|
||||
.any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some());
|
||||
if failure_case_1 || failure_case_2 {
|
||||
println!("\n==== FAILED NODES ====");
|
||||
let mut failed_nodes =
|
||||
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()];
|
||||
for adv in status.iter().filter(|adv| !adv.is_up) {
|
||||
if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) {
|
||||
.map(|adv| (adv.id, adv))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let tf = timeago::Formatter::new();
|
||||
failed_nodes.push(format!(
|
||||
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
|
||||
id = adv.id,
|
||||
host = adv.status.hostname,
|
||||
addr = adv.addr,
|
||||
tags = cfg.tags.join(","),
|
||||
zone = cfg.zone,
|
||||
capacity = cfg.capacity_string(),
|
||||
last_seen = adv
|
||||
.last_seen_secs_ago
|
||||
let mut drain_msg = false;
|
||||
let mut failed_nodes = vec!["ID\tHostname\tTags\tZone\tCapacity\tLast seen".to_string()];
|
||||
let mut listed = HashSet::new();
|
||||
for ver in layout.versions.iter().rev() {
|
||||
for (node, _, role) in ver.roles.items().iter() {
|
||||
let cfg = match role {
|
||||
NodeRoleV(Some(role)) if role.capacity.is_some() => role,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
if listed.contains(node) {
|
||||
continue;
|
||||
}
|
||||
listed.insert(*node);
|
||||
|
||||
let adv = status_map.get(node);
|
||||
if adv.map(|x| x.is_up).unwrap_or(false) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Node is in a layout version, is not a gateway node, and is not up:
|
||||
// it is in a failed state, add proper line to the output
|
||||
let (host, last_seen) = match adv {
|
||||
Some(adv) => (
|
||||
adv.status.hostname.as_deref().unwrap_or("?"),
|
||||
adv.last_seen_secs_ago
|
||||
.map(|s| tf.convert(Duration::from_secs(s)))
|
||||
.unwrap_or_else(|| "never seen".into()),
|
||||
));
|
||||
}
|
||||
}
|
||||
for (id, _, role_v) in layout.roles.items().iter() {
|
||||
if let NodeRoleV(Some(cfg)) = role_v {
|
||||
if !status_keys.contains(id) {
|
||||
),
|
||||
None => ("??", "never seen".into()),
|
||||
};
|
||||
let capacity = if ver.version == layout.current().version {
|
||||
cfg.capacity_string()
|
||||
} else {
|
||||
drain_msg = true;
|
||||
"draining metadata...".to_string()
|
||||
};
|
||||
failed_nodes.push(format!(
|
||||
"{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen",
|
||||
id = id,
|
||||
"{id:?}\t{host}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
|
||||
id = node,
|
||||
host = host,
|
||||
tags = cfg.tags.join(","),
|
||||
zone = cfg.zone,
|
||||
capacity = cfg.capacity_string(),
|
||||
capacity = capacity,
|
||||
last_seen = last_seen,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if failed_nodes.len() > 1 {
|
||||
println!("\n==== FAILED NODES ====");
|
||||
format_table(failed_nodes);
|
||||
if drain_msg {
|
||||
println!();
|
||||
println!("Your cluster is expecting to drain data from nodes that are currently unavailable.");
|
||||
println!("If these nodes are definitely dead, please review the layout history with");
|
||||
println!(
|
||||
"`garage layout history` and use `garage layout skip-dead-nodes` to force progress."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if print_staging_role_changes(&layout) {
|
||||
|
@ -229,3 +263,18 @@ pub async fn cmd_admin(
|
|||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---- utility ----
|
||||
|
||||
pub async fn fetch_status(
|
||||
rpc_cli: &Endpoint<SystemRpc, ()>,
|
||||
rpc_host: NodeID,
|
||||
) -> Result<Vec<KnownNodeInfo>, Error> {
|
||||
match rpc_cli
|
||||
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
|
||||
.await??
|
||||
{
|
||||
SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes),
|
||||
resp => Err(Error::unexpected_rpc_message(resp)),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ pub struct ConvertDbOpt {
|
|||
/// https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0)
|
||||
#[structopt(short = "i")]
|
||||
input_path: PathBuf,
|
||||
/// Input database engine (sled, lmdb or sqlite; limited by db engines
|
||||
/// Input database engine (lmdb or sqlite; limited by db engines
|
||||
/// enabled in this build)
|
||||
#[structopt(short = "a")]
|
||||
input_engine: Engine,
|
||||
|
|
|
@ -32,6 +32,10 @@ pub async fn cli_layout_command_dispatch(
|
|||
LayoutOperation::Config(config_opt) => {
|
||||
cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await
|
||||
}
|
||||
LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await,
|
||||
LayoutOperation::SkipDeadNodes(assume_sync_opt) => {
|
||||
cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -49,6 +53,7 @@ pub async fn cmd_assign_role(
|
|||
};
|
||||
|
||||
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
let all_nodes = layout.get_all_nodes();
|
||||
|
||||
let added_nodes = args
|
||||
.node_ids
|
||||
|
@ -58,21 +63,23 @@ pub async fn cmd_assign_role(
|
|||
status
|
||||
.iter()
|
||||
.map(|adv| adv.id)
|
||||
.chain(layout.node_ids().iter().cloned()),
|
||||
.chain(all_nodes.iter().cloned()),
|
||||
node_id,
|
||||
)
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
let mut roles = layout.roles.clone();
|
||||
roles.merge(&layout.staging_roles);
|
||||
let mut roles = layout.current().roles.clone();
|
||||
roles.merge(&layout.staging.get().roles);
|
||||
|
||||
for replaced in args.replace.iter() {
|
||||
let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?;
|
||||
let replaced_node = find_matching_node(all_nodes.iter().cloned(), replaced)?;
|
||||
match roles.get(&replaced_node) {
|
||||
Some(NodeRoleV(Some(_))) => {
|
||||
layout
|
||||
.staging_roles
|
||||
.staging
|
||||
.get_mut()
|
||||
.roles
|
||||
.merge(&roles.update_mutator(replaced_node, NodeRoleV(None)));
|
||||
}
|
||||
_ => {
|
||||
|
@ -130,7 +137,9 @@ pub async fn cmd_assign_role(
|
|||
};
|
||||
|
||||
layout
|
||||
.staging_roles
|
||||
.staging
|
||||
.get_mut()
|
||||
.roles
|
||||
.merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry))));
|
||||
}
|
||||
|
||||
|
@ -149,14 +158,16 @@ pub async fn cmd_remove_role(
|
|||
) -> Result<(), Error> {
|
||||
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
|
||||
let mut roles = layout.roles.clone();
|
||||
roles.merge(&layout.staging_roles);
|
||||
let mut roles = layout.current().roles.clone();
|
||||
roles.merge(&layout.staging.get().roles);
|
||||
|
||||
let deleted_node =
|
||||
find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?;
|
||||
|
||||
layout
|
||||
.staging_roles
|
||||
.staging
|
||||
.get_mut()
|
||||
.roles
|
||||
.merge(&roles.update_mutator(deleted_node, NodeRoleV(None)));
|
||||
|
||||
send_layout(rpc_cli, rpc_host, layout).await?;
|
||||
|
@ -174,13 +185,16 @@ pub async fn cmd_show_layout(
|
|||
let layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
|
||||
println!("==== CURRENT CLUSTER LAYOUT ====");
|
||||
print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
|
||||
print_cluster_layout(layout.current(), "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
|
||||
println!();
|
||||
println!("Current cluster layout version: {}", layout.version);
|
||||
println!(
|
||||
"Current cluster layout version: {}",
|
||||
layout.current().version
|
||||
);
|
||||
|
||||
let has_role_changes = print_staging_role_changes(&layout);
|
||||
if has_role_changes {
|
||||
let v = layout.version;
|
||||
let v = layout.current().version;
|
||||
let res_apply = layout.apply_staged_changes(Some(v + 1));
|
||||
|
||||
// this will print the stats of what partitions
|
||||
|
@ -189,7 +203,7 @@ pub async fn cmd_show_layout(
|
|||
Ok((layout, msg)) => {
|
||||
println!();
|
||||
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
|
||||
print_cluster_layout(&layout, "No nodes have a role in the new layout.");
|
||||
print_cluster_layout(layout.current(), "No nodes have a role in the new layout.");
|
||||
println!();
|
||||
|
||||
for line in msg.iter() {
|
||||
|
@ -199,16 +213,12 @@ pub async fn cmd_show_layout(
|
|||
println!();
|
||||
println!(" garage layout apply --version {}", v + 1);
|
||||
println!();
|
||||
println!(
|
||||
"You can also revert all proposed changes with: garage layout revert --version {}",
|
||||
v + 1)
|
||||
println!("You can also revert all proposed changes with: garage layout revert");
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error while trying to compute the assignment: {}", e);
|
||||
println!("This new layout cannot yet be applied.");
|
||||
println!(
|
||||
"You can also revert all proposed changes with: garage layout revert --version {}",
|
||||
v + 1)
|
||||
println!("You can also revert all proposed changes with: garage layout revert");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -241,9 +251,15 @@ pub async fn cmd_revert_layout(
|
|||
rpc_host: NodeID,
|
||||
revert_opt: RevertLayoutOpt,
|
||||
) -> Result<(), Error> {
|
||||
if !revert_opt.yes {
|
||||
return Err(Error::Message(
|
||||
"Please add the --yes flag to run the layout revert operation".into(),
|
||||
));
|
||||
}
|
||||
|
||||
let layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
|
||||
let layout = layout.revert_staged_changes(revert_opt.version)?;
|
||||
let layout = layout.revert_staged_changes()?;
|
||||
|
||||
send_layout(rpc_cli, rpc_host, layout).await?;
|
||||
|
||||
|
@ -266,11 +282,11 @@ pub async fn cmd_config_layout(
|
|||
.parse::<ZoneRedundancy>()
|
||||
.ok_or_message("invalid zone redundancy value")?;
|
||||
if let ZoneRedundancy::AtLeast(r_int) = r {
|
||||
if r_int > layout.replication_factor {
|
||||
if r_int > layout.current().replication_factor {
|
||||
return Err(Error::Message(format!(
|
||||
"The zone redundancy must be smaller or equal to the \
|
||||
replication factor ({}).",
|
||||
layout.replication_factor
|
||||
layout.current().replication_factor
|
||||
)));
|
||||
} else if r_int < 1 {
|
||||
return Err(Error::Message(
|
||||
|
@ -280,7 +296,9 @@ pub async fn cmd_config_layout(
|
|||
}
|
||||
|
||||
layout
|
||||
.staging_parameters
|
||||
.staging
|
||||
.get_mut()
|
||||
.parameters
|
||||
.update(LayoutParameters { zone_redundancy: r });
|
||||
println!("The zone redundancy parameter has been set to '{}'.", r);
|
||||
did_something = true;
|
||||
|
@ -297,25 +315,166 @@ pub async fn cmd_config_layout(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn cmd_layout_history(
|
||||
rpc_cli: &Endpoint<SystemRpc, ()>,
|
||||
rpc_host: NodeID,
|
||||
) -> Result<(), Error> {
|
||||
let layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
let min_stored = layout.min_stored();
|
||||
|
||||
println!("==== LAYOUT HISTORY ====");
|
||||
let mut table = vec!["Version\tStatus\tStorage nodes\tGateway nodes".to_string()];
|
||||
for ver in layout
|
||||
.versions
|
||||
.iter()
|
||||
.rev()
|
||||
.chain(layout.old_versions.iter().rev())
|
||||
{
|
||||
let status = if ver.version == layout.current().version {
|
||||
"current"
|
||||
} else if ver.version >= min_stored {
|
||||
"draining"
|
||||
} else {
|
||||
"historical"
|
||||
};
|
||||
table.push(format!(
|
||||
"#{}\t{}\t{}\t{}",
|
||||
ver.version,
|
||||
status,
|
||||
ver.roles
|
||||
.items()
|
||||
.iter()
|
||||
.filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_some()))
|
||||
.count(),
|
||||
ver.roles
|
||||
.items()
|
||||
.iter()
|
||||
.filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_none()))
|
||||
.count(),
|
||||
));
|
||||
}
|
||||
format_table(table);
|
||||
println!();
|
||||
|
||||
if layout.versions.len() > 1 {
|
||||
println!("==== UPDATE TRACKERS ====");
|
||||
println!("Several layout versions are currently live in the version, and data is being migrated.");
|
||||
println!(
|
||||
"This is the internal data that Garage stores to know which nodes have what data."
|
||||
);
|
||||
println!();
|
||||
let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()];
|
||||
let all_nodes = layout.get_all_nodes();
|
||||
for node in all_nodes.iter() {
|
||||
table.push(format!(
|
||||
"{:?}\t#{}\t#{}\t#{}",
|
||||
node,
|
||||
layout.update_trackers.ack_map.get(node, min_stored),
|
||||
layout.update_trackers.sync_map.get(node, min_stored),
|
||||
layout.update_trackers.sync_ack_map.get(node, min_stored),
|
||||
));
|
||||
}
|
||||
table[1..].sort();
|
||||
format_table(table);
|
||||
|
||||
println!();
|
||||
println!(
|
||||
"If some nodes are not catching up to the latest layout version in the update trackers,"
|
||||
);
|
||||
println!("it might be because they are offline or unable to complete a sync successfully.");
|
||||
println!(
|
||||
"You may force progress using `garage layout skip-dead-nodes --version {}`",
|
||||
layout.current().version
|
||||
);
|
||||
} else {
|
||||
println!("Your cluster is currently in a stable state with a single live layout version.");
|
||||
println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,");
|
||||
println!(
|
||||
"so you might want to keep old nodes online until their data directories become empty."
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn cmd_layout_skip_dead_nodes(
|
||||
rpc_cli: &Endpoint<SystemRpc, ()>,
|
||||
rpc_host: NodeID,
|
||||
opt: SkipDeadNodesOpt,
|
||||
) -> Result<(), Error> {
|
||||
let status = fetch_status(rpc_cli, rpc_host).await?;
|
||||
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
|
||||
|
||||
if layout.versions.len() == 1 {
|
||||
return Err(Error::Message(
|
||||
"This command cannot be called when there is only one live cluster layout version"
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
|
||||
let min_v = layout.min_stored();
|
||||
if opt.version <= min_v || opt.version > layout.current().version {
|
||||
return Err(Error::Message(format!(
|
||||
"Invalid version, you may use the following version numbers: {}",
|
||||
(min_v + 1..=layout.current().version)
|
||||
.map(|x| x.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
)));
|
||||
}
|
||||
|
||||
let all_nodes = layout.get_all_nodes();
|
||||
let mut did_something = false;
|
||||
for node in all_nodes.iter() {
|
||||
if status.iter().any(|x| x.id == *node && x.is_up) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if layout.update_trackers.ack_map.set_max(*node, opt.version) {
|
||||
println!("Increased the ACK tracker for node {:?}", node);
|
||||
did_something = true;
|
||||
}
|
||||
|
||||
if opt.allow_missing_data {
|
||||
if layout.update_trackers.sync_map.set_max(*node, opt.version) {
|
||||
println!("Increased the SYNC tracker for node {:?}", node);
|
||||
did_something = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if did_something {
|
||||
send_layout(rpc_cli, rpc_host, layout).await?;
|
||||
println!("Success.");
|
||||
Ok(())
|
||||
} else if !opt.allow_missing_data {
|
||||
Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into()))
|
||||
} else {
|
||||
Err(Error::Message(
|
||||
"Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
// --- utility ---
|
||||
|
||||
pub async fn fetch_layout(
|
||||
rpc_cli: &Endpoint<SystemRpc, ()>,
|
||||
rpc_host: NodeID,
|
||||
) -> Result<ClusterLayout, Error> {
|
||||
) -> Result<LayoutHistory, Error> {
|
||||
match rpc_cli
|
||||
.call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL)
|
||||
.await??
|
||||
{
|
||||
SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
|
||||
resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
|
||||
resp => Err(Error::unexpected_rpc_message(resp)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_layout(
|
||||
rpc_cli: &Endpoint<SystemRpc, ()>,
|
||||
rpc_host: NodeID,
|
||||
layout: ClusterLayout,
|
||||
layout: LayoutHistory,
|
||||
) -> Result<(), Error> {
|
||||
rpc_cli
|
||||
.call(
|
||||
|
@ -327,7 +486,7 @@ pub async fn send_layout(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
|
||||
pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) {
|
||||
let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
|
||||
for (id, _, role) in layout.roles.items().iter() {
|
||||
let role = match &role.0 {
|
||||
|
@ -366,21 +525,22 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
|
||||
let has_role_changes = layout
|
||||
.staging_roles
|
||||
pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool {
|
||||
let staging = layout.staging.get();
|
||||
let has_role_changes = staging
|
||||
.roles
|
||||
.items()
|
||||
.iter()
|
||||
.any(|(k, _, v)| layout.roles.get(k) != Some(v));
|
||||
let has_layout_changes = *layout.staging_parameters.get() != layout.parameters;
|
||||
.any(|(k, _, v)| layout.current().roles.get(k) != Some(v));
|
||||
let has_layout_changes = *staging.parameters.get() != layout.current().parameters;
|
||||
|
||||
if has_role_changes || has_layout_changes {
|
||||
println!();
|
||||
println!("==== STAGED ROLE CHANGES ====");
|
||||
if has_role_changes {
|
||||
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
|
||||
for (id, _, role) in layout.staging_roles.items().iter() {
|
||||
if layout.roles.get(id) == Some(role) {
|
||||
for (id, _, role) in staging.roles.items().iter() {
|
||||
if layout.current().roles.get(id) == Some(role) {
|
||||
continue;
|
||||
}
|
||||
if let Some(role) = &role.0 {
|
||||
|
@ -402,7 +562,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
|
|||
if has_layout_changes {
|
||||
println!(
|
||||
"Zone redundancy: {}",
|
||||
layout.staging_parameters.get().zone_redundancy
|
||||
staging.parameters.get().zone_redundancy
|
||||
);
|
||||
}
|
||||
true
|
||||
|
|
|
@ -31,11 +31,6 @@ pub enum Command {
|
|||
#[structopt(name = "key", version = garage_version())]
|
||||
Key(KeyOperation),
|
||||
|
||||
/// Run migrations from previous Garage version
|
||||
/// (DO NOT USE WITHOUT READING FULL DOCUMENTATION)
|
||||
#[structopt(name = "migrate", version = garage_version())]
|
||||
Migrate(MigrateOpt),
|
||||
|
||||
/// Start repair of node data on remote node
|
||||
#[structopt(name = "repair", version = garage_version())]
|
||||
Repair(RepairOpt),
|
||||
|
@ -118,6 +113,14 @@ pub enum LayoutOperation {
|
|||
/// Revert staged changes to cluster layout
|
||||
#[structopt(name = "revert", version = garage_version())]
|
||||
Revert(RevertLayoutOpt),
|
||||
|
||||
/// View the history of layouts in the cluster
|
||||
#[structopt(name = "history", version = garage_version())]
|
||||
History,
|
||||
|
||||
/// Skip dead nodes when waiting for a new layout version to be synchronized
|
||||
#[structopt(name = "skip-dead-nodes", version = garage_version())]
|
||||
SkipDeadNodes(SkipDeadNodesOpt),
|
||||
}
|
||||
|
||||
#[derive(StructOpt, Debug)]
|
||||
|
@ -170,9 +173,21 @@ pub struct ApplyLayoutOpt {
|
|||
|
||||
#[derive(StructOpt, Debug)]
|
||||
pub struct RevertLayoutOpt {
|
||||
/// Version number of old configuration to which to revert
|
||||
/// The revert operation will not be run unless this flag is added
|
||||
#[structopt(long = "yes")]
|
||||
pub(crate) yes: bool,
|
||||
}
|
||||
|
||||
#[derive(StructOpt, Debug)]
|
||||
pub struct SkipDeadNodesOpt {
|
||||
/// Version number of the layout to assume is currently up-to-date.
|
||||
/// This will generally be the current layout version.
|
||||
#[structopt(long = "version")]
|
||||
pub(crate) version: Option<u64>,
|
||||
pub(crate) version: u64,
|
||||
/// Allow the skip even if a quorum of nodes could not be found for
|
||||
/// the data among the remaining nodes
|
||||
#[structopt(long = "allow-missing-data")]
|
||||
pub(crate) allow_missing_data: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, StructOpt, Debug)]
|
||||
|
@ -429,23 +444,6 @@ pub struct KeyImportOpt {
|
|||
pub yes: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
|
||||
pub struct MigrateOpt {
|
||||
/// Confirm the launch of the migrate operation
|
||||
#[structopt(long = "yes")]
|
||||
pub yes: bool,
|
||||
|
||||
#[structopt(subcommand)]
|
||||
pub what: MigrateWhat,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
|
||||
pub enum MigrateWhat {
|
||||
/// Migrate buckets and permissions from v0.5.0
|
||||
#[structopt(name = "buckets050", version = garage_version())]
|
||||
Buckets050,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
|
||||
pub struct RepairOpt {
|
||||
/// Launch repair operation on all nodes
|
||||
|
@ -475,8 +473,11 @@ pub enum RepairWhat {
|
|||
#[structopt(name = "mpu", version = garage_version())]
|
||||
MultipartUploads,
|
||||
/// Repropagate version deletions to the block ref table
|
||||
#[structopt(name = "block_refs", version = garage_version())]
|
||||
#[structopt(name = "block-refs", version = garage_version())]
|
||||
BlockRefs,
|
||||
/// Recalculate block reference counters
|
||||
#[structopt(name = "block-rc", version = garage_version())]
|
||||
BlockRc,
|
||||
/// Verify integrity of all blocks on disc
|
||||
#[structopt(name = "scrub", version = garage_version())]
|
||||
Scrub {
|
||||
|
@ -537,10 +538,6 @@ pub struct StatsOpt {
|
|||
#[structopt(short = "a", long = "all-nodes")]
|
||||
pub all_nodes: bool,
|
||||
|
||||
/// Gather detailed statistics (this can be long)
|
||||
#[structopt(short = "d", long = "detailed")]
|
||||
pub detailed: bool,
|
||||
|
||||
/// Don't show global cluster stats (internal use in RPC)
|
||||
#[structopt(skip)]
|
||||
#[serde(default)]
|
||||
|
|
|
@ -450,6 +450,8 @@ pub fn print_block_info(
|
|||
|
||||
if refcount != nondeleted_count {
|
||||
println!();
|
||||
println!("Warning: refcount does not match number of non-deleted versions");
|
||||
println!(
|
||||
"Warning: refcount does not match number of non-deleted versions, you should try `garage repair block-rc`."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,8 +18,8 @@ compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled
|
|||
#[cfg(all(feature = "bundled-libs", feature = "system-libs"))]
|
||||
compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled");
|
||||
|
||||
#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))]
|
||||
compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite.");
|
||||
#[cfg(not(any(feature = "lmdb", feature = "sqlite")))]
|
||||
compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb or sqlite.");
|
||||
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
|
@ -72,8 +72,6 @@ async fn main() {
|
|||
let features = &[
|
||||
#[cfg(feature = "k2v")]
|
||||
"k2v",
|
||||
#[cfg(feature = "sled")]
|
||||
"sled",
|
||||
#[cfg(feature = "lmdb")]
|
||||
"lmdb",
|
||||
#[cfg(feature = "sqlite")]
|
||||
|
|
|
@ -4,6 +4,7 @@ use std::time::Duration;
|
|||
use async_trait::async_trait;
|
||||
use tokio::sync::watch;
|
||||
|
||||
use garage_block::manager::BlockManager;
|
||||
use garage_block::repair::ScrubWorkerCommand;
|
||||
|
||||
use garage_model::garage::Garage;
|
||||
|
@ -16,11 +17,14 @@ use garage_table::replication::*;
|
|||
use garage_table::*;
|
||||
|
||||
use garage_util::background::*;
|
||||
use garage_util::data::*;
|
||||
use garage_util::error::Error;
|
||||
use garage_util::migrate::Migrate;
|
||||
|
||||
use crate::*;
|
||||
|
||||
const RC_REPAIR_ITER_COUNT: usize = 64;
|
||||
|
||||
pub async fn launch_online_repair(
|
||||
garage: &Arc<Garage>,
|
||||
bg: &BackgroundRunner,
|
||||
|
@ -47,6 +51,13 @@ pub async fn launch_online_repair(
|
|||
info!("Repairing the block refs table");
|
||||
bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairBlockRefs));
|
||||
}
|
||||
RepairWhat::BlockRc => {
|
||||
info!("Repairing the block reference counters");
|
||||
bg.spawn_worker(BlockRcRepair::new(
|
||||
garage.block_manager.clone(),
|
||||
garage.block_ref_table.clone(),
|
||||
));
|
||||
}
|
||||
RepairWhat::Blocks => {
|
||||
info!("Repairing the stored blocks");
|
||||
bg.spawn_worker(garage_block::repair::RepairWorker::new(
|
||||
|
@ -282,3 +293,98 @@ impl TableRepair for RepairMpu {
|
|||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
// ===== block reference counter repair =====
|
||||
|
||||
pub struct BlockRcRepair {
|
||||
block_manager: Arc<BlockManager>,
|
||||
block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
|
||||
cursor: Hash,
|
||||
counter: u64,
|
||||
repairs: u64,
|
||||
}
|
||||
|
||||
impl BlockRcRepair {
|
||||
fn new(
|
||||
block_manager: Arc<BlockManager>,
|
||||
block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
block_manager,
|
||||
block_ref_table,
|
||||
cursor: [0u8; 32].into(),
|
||||
counter: 0,
|
||||
repairs: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Worker for BlockRcRepair {
|
||||
fn name(&self) -> String {
|
||||
format!("Block refcount repair worker")
|
||||
}
|
||||
|
||||
fn status(&self) -> WorkerStatus {
|
||||
WorkerStatus {
|
||||
progress: Some(format!("{} ({})", self.counter, self.repairs)),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
||||
for _i in 0..RC_REPAIR_ITER_COUNT {
|
||||
let next1 = self
|
||||
.block_manager
|
||||
.rc
|
||||
.rc_table
|
||||
.range(self.cursor.as_slice()..)?
|
||||
.next()
|
||||
.transpose()?
|
||||
.map(|(k, _)| Hash::try_from(k.as_slice()).unwrap());
|
||||
let next2 = self
|
||||
.block_ref_table
|
||||
.data
|
||||
.store
|
||||
.range(self.cursor.as_slice()..)?
|
||||
.next()
|
||||
.transpose()?
|
||||
.map(|(k, _)| Hash::try_from(&k[..32]).unwrap());
|
||||
let next = match (next1, next2) {
|
||||
(Some(k1), Some(k2)) => std::cmp::min(k1, k2),
|
||||
(Some(k), None) | (None, Some(k)) => k,
|
||||
(None, None) => {
|
||||
info!(
|
||||
"{}: finished, done {}, fixed {}",
|
||||
self.name(),
|
||||
self.counter,
|
||||
self.repairs
|
||||
);
|
||||
return Ok(WorkerState::Done);
|
||||
}
|
||||
};
|
||||
|
||||
if self.block_manager.rc.recalculate_rc(&next)?.1 {
|
||||
self.repairs += 1;
|
||||
}
|
||||
self.counter += 1;
|
||||
if let Some(next_incr) = next.increment() {
|
||||
self.cursor = next_incr;
|
||||
} else {
|
||||
info!(
|
||||
"{}: finished, done {}, fixed {}",
|
||||
self.name(),
|
||||
self.counter,
|
||||
self.repairs
|
||||
);
|
||||
return Ok(WorkerState::Done);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(WorkerState::Busy)
|
||||
}
|
||||
|
||||
async fn wait_for_work(&mut self) -> WorkerState {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -163,7 +163,7 @@ mod tests {
|
|||
r#"
|
||||
metadata_dir = "/tmp/garage/meta"
|
||||
data_dir = "/tmp/garage/data"
|
||||
replication_mode = "3"
|
||||
replication_factor = 3
|
||||
rpc_bind_addr = "[::]:3901"
|
||||
rpc_secret_file = "{}"
|
||||
|
||||
|
@ -185,7 +185,7 @@ mod tests {
|
|||
r#"
|
||||
metadata_dir = "/tmp/garage/meta"
|
||||
data_dir = "/tmp/garage/data"
|
||||
replication_mode = "3"
|
||||
replication_factor = 3
|
||||
rpc_bind_addr = "[::]:3901"
|
||||
rpc_secret_file = "{}"
|
||||
allow_world_readable_secrets = true
|
||||
|
@ -296,7 +296,7 @@ mod tests {
|
|||
r#"
|
||||
metadata_dir = "/tmp/garage/meta"
|
||||
data_dir = "/tmp/garage/data"
|
||||
replication_mode = "3"
|
||||
replication_factor = 3
|
||||
rpc_bind_addr = "[::]:3901"
|
||||
rpc_secret= "dummy"
|
||||
rpc_secret_file = "dummy"
|
||||
|
|
|
@ -14,42 +14,20 @@ impl CommandExt for process::Command {
|
|||
}
|
||||
|
||||
fn expect_success_status(&mut self, msg: &str) -> process::ExitStatus {
|
||||
let status = self.status().expect(msg);
|
||||
status.expect_success(msg);
|
||||
status
|
||||
self.expect_success_output(msg).status
|
||||
}
|
||||
fn expect_success_output(&mut self, msg: &str) -> process::Output {
|
||||
let output = self.output().expect(msg);
|
||||
output.expect_success(msg);
|
||||
if !output.status.success() {
|
||||
panic!(
|
||||
"{}: command {:?} exited with error {:?}\nSTDOUT: {}\nSTDERR: {}",
|
||||
msg,
|
||||
self,
|
||||
output.status.code(),
|
||||
String::from_utf8_lossy(&output.stdout),
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
pub trait OutputExt {
|
||||
fn expect_success(&self, msg: &str);
|
||||
}
|
||||
|
||||
impl OutputExt for process::Output {
|
||||
fn expect_success(&self, msg: &str) {
|
||||
self.status.expect_success(msg)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait ExitStatusExt {
|
||||
fn expect_success(&self, msg: &str);
|
||||
}
|
||||
|
||||
impl ExitStatusExt for process::ExitStatus {
|
||||
fn expect_success(&self, msg: &str) {
|
||||
if !self.success() {
|
||||
match self.code() {
|
||||
Some(code) => panic!(
|
||||
"Command exited with code {code}: {msg}",
|
||||
code = code,
|
||||
msg = msg
|
||||
),
|
||||
None => panic!("Command exited with signal: {msg}", msg = msg),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ metadata_dir = "{path}/meta"
|
|||
data_dir = "{path}/data"
|
||||
db_engine = "{db_engine}"
|
||||
|
||||
replication_mode = "1"
|
||||
replication_factor = 1
|
||||
|
||||
rpc_bind_addr = "127.0.0.1:{rpc_port}"
|
||||
rpc_public_addr = "127.0.0.1:{rpc_port}"
|
||||
|
@ -100,7 +100,7 @@ api_bind_addr = "127.0.0.1:{admin_port}"
|
|||
.arg("server")
|
||||
.stdout(stdout)
|
||||
.stderr(stderr)
|
||||
.env("RUST_LOG", "garage=info,garage_api=trace")
|
||||
.env("RUST_LOG", "garage=debug,garage_api=trace")
|
||||
.spawn()
|
||||
.expect("Could not start garage");
|
||||
|
||||
|
|
|
@ -3,5 +3,6 @@ mod multipart;
|
|||
mod objects;
|
||||
mod presigned;
|
||||
mod simple;
|
||||
mod ssec;
|
||||
mod streaming_signature;
|
||||
mod website;
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
use crate::common;
|
||||
use aws_sdk_s3::primitives::ByteStream;
|
||||
use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart};
|
||||
use aws_sdk_s3::types::{ChecksumAlgorithm, CompletedMultipartUpload, CompletedPart};
|
||||
use base64::prelude::*;
|
||||
|
||||
const SZ_5MB: usize = 5 * 1024 * 1024;
|
||||
const SZ_10MB: usize = 10 * 1024 * 1024;
|
||||
|
@ -189,6 +190,153 @@ async fn test_multipart_upload() {
|
|||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multipart_with_checksum() {
|
||||
let ctx = common::context();
|
||||
let bucket = ctx.create_bucket("testmpu-cksum");
|
||||
|
||||
let u1 = vec![0x11; SZ_5MB];
|
||||
let u2 = vec![0x22; SZ_5MB];
|
||||
let u3 = vec![0x33; SZ_5MB];
|
||||
|
||||
let ck1 = calculate_sha1(&u1);
|
||||
let ck2 = calculate_sha1(&u2);
|
||||
let ck3 = calculate_sha1(&u3);
|
||||
|
||||
let up = ctx
|
||||
.client
|
||||
.create_multipart_upload()
|
||||
.bucket(&bucket)
|
||||
.checksum_algorithm(ChecksumAlgorithm::Sha1)
|
||||
.key("a")
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(up.upload_id.is_some());
|
||||
|
||||
let uid = up.upload_id.as_ref().unwrap();
|
||||
|
||||
let p1 = ctx
|
||||
.client
|
||||
.upload_part()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.part_number(1)
|
||||
.checksum_sha1(&ck1)
|
||||
.body(ByteStream::from(u1.clone()))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// wrong checksum value should return an error
|
||||
let err1 = ctx
|
||||
.client
|
||||
.upload_part()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.part_number(2)
|
||||
.checksum_sha1(&ck1)
|
||||
.body(ByteStream::from(u2.clone()))
|
||||
.send()
|
||||
.await;
|
||||
assert!(err1.is_err());
|
||||
|
||||
let p2 = ctx
|
||||
.client
|
||||
.upload_part()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.part_number(2)
|
||||
.checksum_sha1(&ck2)
|
||||
.body(ByteStream::from(u2))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let p3 = ctx
|
||||
.client
|
||||
.upload_part()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.part_number(3)
|
||||
.checksum_sha1(&ck3)
|
||||
.body(ByteStream::from(u3.clone()))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
let r = ctx
|
||||
.client
|
||||
.list_parts()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
let parts = r.parts.unwrap();
|
||||
assert_eq!(parts.len(), 3);
|
||||
assert!(parts[0].checksum_crc32.is_none());
|
||||
assert!(parts[0].checksum_crc32_c.is_none());
|
||||
assert!(parts[0].checksum_sha256.is_none());
|
||||
assert_eq!(parts[0].checksum_sha1.as_deref().unwrap(), ck1);
|
||||
assert_eq!(parts[1].checksum_sha1.as_deref().unwrap(), ck2);
|
||||
assert_eq!(parts[2].checksum_sha1.as_deref().unwrap(), ck3);
|
||||
}
|
||||
|
||||
let cmp = CompletedMultipartUpload::builder()
|
||||
.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number(1)
|
||||
.checksum_sha1(&ck1)
|
||||
.e_tag(p1.e_tag.unwrap())
|
||||
.build(),
|
||||
)
|
||||
.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number(2)
|
||||
.checksum_sha1(&ck2)
|
||||
.e_tag(p2.e_tag.unwrap())
|
||||
.build(),
|
||||
)
|
||||
.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number(3)
|
||||
.checksum_sha1(&ck3)
|
||||
.e_tag(p3.e_tag.unwrap())
|
||||
.build(),
|
||||
)
|
||||
.build();
|
||||
|
||||
let expected_checksum = calculate_sha1(
|
||||
&vec![
|
||||
BASE64_STANDARD.decode(&ck1).unwrap(),
|
||||
BASE64_STANDARD.decode(&ck2).unwrap(),
|
||||
BASE64_STANDARD.decode(&ck3).unwrap(),
|
||||
]
|
||||
.concat(),
|
||||
);
|
||||
|
||||
let res = ctx
|
||||
.client
|
||||
.complete_multipart_upload()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.checksum_sha1(expected_checksum.clone())
|
||||
.multipart_upload(cmp)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(res.checksum_sha1, Some(expected_checksum));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_uploadlistpart() {
|
||||
let ctx = common::context();
|
||||
|
@ -624,3 +772,11 @@ async fn test_uploadpartcopy() {
|
|||
assert_eq!(real_obj.len(), exp_obj.len());
|
||||
assert_eq!(real_obj, exp_obj);
|
||||
}
|
||||
|
||||
fn calculate_sha1(bytes: &[u8]) -> String {
|
||||
use sha1::{Digest, Sha1};
|
||||
|
||||
let mut hasher = Sha1::new();
|
||||
hasher.update(bytes);
|
||||
BASE64_STANDARD.encode(&hasher.finalize()[..])
|
||||
}
|
||||
|
|
455
src/garage/tests/s3/ssec.rs
Normal file
455
src/garage/tests/s3/ssec.rs
Normal file
|
@ -0,0 +1,455 @@
|
|||
use crate::common::{self, Context};
|
||||
use aws_sdk_s3::primitives::ByteStream;
|
||||
use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart};
|
||||
|
||||
const SSEC_KEY: &str = "u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y=";
|
||||
const SSEC_KEY_MD5: &str = "jMGbs3GyZkYjJUP6q5jA7g==";
|
||||
const SSEC_KEY2: &str = "XkYVk4Z3vVDO2yJaUqCAEZX6lL10voMxtV06d8my/eU=";
|
||||
const SSEC_KEY2_MD5: &str = "kedo2ab8J1MCjHwJuLTJHw==";
|
||||
|
||||
const SZ_2MB: usize = 2 * 1024 * 1024;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ssec_object() {
|
||||
let ctx = common::context();
|
||||
let bucket = ctx.create_bucket("sse-c");
|
||||
|
||||
let bytes1 = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".to_vec();
|
||||
let bytes2 = (0..400000)
|
||||
.map(|x| ((x * 3792) % 256) as u8)
|
||||
.collect::<Vec<u8>>();
|
||||
|
||||
for data in vec![bytes1, bytes2] {
|
||||
let stream = ByteStream::new(data.clone().into());
|
||||
|
||||
// Write encrypted object
|
||||
let r = ctx
|
||||
.client
|
||||
.put_object()
|
||||
.bucket(&bucket)
|
||||
.key("testobj")
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY)
|
||||
.sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.body(stream)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
|
||||
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into()));
|
||||
|
||||
test_read_encrypted(
|
||||
&ctx,
|
||||
&bucket,
|
||||
"testobj",
|
||||
&data,
|
||||
SSEC_KEY,
|
||||
SSEC_KEY_MD5,
|
||||
SSEC_KEY2,
|
||||
SSEC_KEY2_MD5,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Test copy from encrypted to non-encrypted
|
||||
let r = ctx
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(&bucket)
|
||||
.key("test-copy-enc-dec")
|
||||
.copy_source(format!("{}/{}", bucket, "testobj"))
|
||||
.copy_source_sse_customer_algorithm("AES256")
|
||||
.copy_source_sse_customer_key(SSEC_KEY)
|
||||
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(r.sse_customer_algorithm, None);
|
||||
assert_eq!(r.sse_customer_key_md5, None);
|
||||
|
||||
// Test read decrypted file
|
||||
let r = ctx
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(&bucket)
|
||||
.key("test-copy-enc-dec")
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_bytes_eq!(r.body, &data);
|
||||
assert_eq!(r.sse_customer_algorithm, None);
|
||||
assert_eq!(r.sse_customer_key_md5, None);
|
||||
|
||||
// Test copy from non-encrypted to encrypted
|
||||
let r = ctx
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(&bucket)
|
||||
.key("test-copy-enc-dec-enc")
|
||||
.copy_source(format!("{}/test-copy-enc-dec", bucket))
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
|
||||
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into()));
|
||||
|
||||
test_read_encrypted(
|
||||
&ctx,
|
||||
&bucket,
|
||||
"test-copy-enc-dec-enc",
|
||||
&data,
|
||||
SSEC_KEY2,
|
||||
SSEC_KEY2_MD5,
|
||||
SSEC_KEY,
|
||||
SSEC_KEY_MD5,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Test copy from encrypted to encrypted with different keys
|
||||
let r = ctx
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(&bucket)
|
||||
.key("test-copy-enc-enc")
|
||||
.copy_source(format!("{}/{}", bucket, "testobj"))
|
||||
.copy_source_sse_customer_algorithm("AES256")
|
||||
.copy_source_sse_customer_key(SSEC_KEY)
|
||||
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
|
||||
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into()));
|
||||
test_read_encrypted(
|
||||
&ctx,
|
||||
&bucket,
|
||||
"test-copy-enc-enc",
|
||||
&data,
|
||||
SSEC_KEY2,
|
||||
SSEC_KEY2_MD5,
|
||||
SSEC_KEY,
|
||||
SSEC_KEY_MD5,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Test copy from encrypted to encrypted with the same key
|
||||
let r = ctx
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(&bucket)
|
||||
.key("test-copy-enc-enc-same")
|
||||
.copy_source(format!("{}/{}", bucket, "testobj"))
|
||||
.copy_source_sse_customer_algorithm("AES256")
|
||||
.copy_source_sse_customer_key(SSEC_KEY)
|
||||
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY)
|
||||
.sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
|
||||
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into()));
|
||||
test_read_encrypted(
|
||||
&ctx,
|
||||
&bucket,
|
||||
"test-copy-enc-enc-same",
|
||||
&data,
|
||||
SSEC_KEY,
|
||||
SSEC_KEY_MD5,
|
||||
SSEC_KEY2,
|
||||
SSEC_KEY2_MD5,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multipart_upload() {
|
||||
let ctx = common::context();
|
||||
let bucket = ctx.create_bucket("test-ssec-mpu");
|
||||
|
||||
let u1 = vec![0x11; SZ_2MB];
|
||||
let u2 = vec![0x22; SZ_2MB];
|
||||
let u3 = vec![0x33; SZ_2MB];
|
||||
let all = [&u1[..], &u2[..], &u3[..]].concat();
|
||||
|
||||
// Test simple encrypted mpu
|
||||
{
|
||||
let up = ctx
|
||||
.client
|
||||
.create_multipart_upload()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY)
|
||||
.sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(up.upload_id.is_some());
|
||||
assert_eq!(up.sse_customer_algorithm, Some("AES256".into()));
|
||||
assert_eq!(up.sse_customer_key_md5, Some(SSEC_KEY_MD5.into()));
|
||||
|
||||
let uid = up.upload_id.as_ref().unwrap();
|
||||
|
||||
let mut etags = vec![];
|
||||
for (i, part) in vec![&u1, &u2, &u3].into_iter().enumerate() {
|
||||
let pu = ctx
|
||||
.client
|
||||
.upload_part()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.part_number((i + 1) as i32)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY)
|
||||
.sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.body(ByteStream::from(part.to_vec()))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
etags.push(pu.e_tag.unwrap());
|
||||
}
|
||||
|
||||
let mut cmp = CompletedMultipartUpload::builder();
|
||||
for (i, etag) in etags.into_iter().enumerate() {
|
||||
cmp = cmp.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number((i + 1) as i32)
|
||||
.e_tag(etag)
|
||||
.build(),
|
||||
);
|
||||
}
|
||||
|
||||
ctx.client
|
||||
.complete_multipart_upload()
|
||||
.bucket(&bucket)
|
||||
.key("a")
|
||||
.upload_id(uid)
|
||||
.multipart_upload(cmp.build())
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
test_read_encrypted(
|
||||
&ctx,
|
||||
&bucket,
|
||||
"a",
|
||||
&all,
|
||||
SSEC_KEY,
|
||||
SSEC_KEY_MD5,
|
||||
SSEC_KEY2,
|
||||
SSEC_KEY2_MD5,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
// Test upload part copy from first object
|
||||
{
|
||||
// (setup) Upload a single part object
|
||||
ctx.client
|
||||
.put_object()
|
||||
.bucket(&bucket)
|
||||
.key("b")
|
||||
.body(ByteStream::from(u1.clone()))
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let up = ctx
|
||||
.client
|
||||
.create_multipart_upload()
|
||||
.bucket(&bucket)
|
||||
.key("target")
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
let uid = up.upload_id.as_ref().unwrap();
|
||||
|
||||
let p1 = ctx
|
||||
.client
|
||||
.upload_part()
|
||||
.bucket(&bucket)
|
||||
.key("target")
|
||||
.upload_id(uid)
|
||||
.part_number(1)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.body(ByteStream::from(u3.clone()))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let p2 = ctx
|
||||
.client
|
||||
.upload_part_copy()
|
||||
.bucket(&bucket)
|
||||
.key("target")
|
||||
.upload_id(uid)
|
||||
.part_number(2)
|
||||
.copy_source(format!("{}/a", bucket))
|
||||
.copy_source_range("bytes=500-550000")
|
||||
.copy_source_sse_customer_algorithm("AES256")
|
||||
.copy_source_sse_customer_key(SSEC_KEY)
|
||||
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let p3 = ctx
|
||||
.client
|
||||
.upload_part()
|
||||
.bucket(&bucket)
|
||||
.key("target")
|
||||
.upload_id(uid)
|
||||
.part_number(3)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.body(ByteStream::from(u2.clone()))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let p4 = ctx
|
||||
.client
|
||||
.upload_part_copy()
|
||||
.bucket(&bucket)
|
||||
.key("target")
|
||||
.upload_id(uid)
|
||||
.part_number(4)
|
||||
.copy_source(format!("{}/b", bucket))
|
||||
.copy_source_range("bytes=1500-20500")
|
||||
.copy_source_sse_customer_algorithm("AES256")
|
||||
.copy_source_sse_customer_key(SSEC_KEY2)
|
||||
.copy_source_sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(SSEC_KEY2)
|
||||
.sse_customer_key_md5(SSEC_KEY2_MD5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cmp = CompletedMultipartUpload::builder()
|
||||
.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number(1)
|
||||
.e_tag(p1.e_tag.unwrap())
|
||||
.build(),
|
||||
)
|
||||
.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number(2)
|
||||
.e_tag(p2.copy_part_result.unwrap().e_tag.unwrap())
|
||||
.build(),
|
||||
)
|
||||
.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number(3)
|
||||
.e_tag(p3.e_tag.unwrap())
|
||||
.build(),
|
||||
)
|
||||
.parts(
|
||||
CompletedPart::builder()
|
||||
.part_number(4)
|
||||
.e_tag(p4.copy_part_result.unwrap().e_tag.unwrap())
|
||||
.build(),
|
||||
)
|
||||
.build();
|
||||
|
||||
ctx.client
|
||||
.complete_multipart_upload()
|
||||
.bucket(&bucket)
|
||||
.key("target")
|
||||
.upload_id(uid)
|
||||
.multipart_upload(cmp)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// (check) Get object
|
||||
let expected = [&u3[..], &all[500..550001], &u2[..], &u1[1500..20501]].concat();
|
||||
test_read_encrypted(
|
||||
&ctx,
|
||||
&bucket,
|
||||
"target",
|
||||
&expected,
|
||||
SSEC_KEY2,
|
||||
SSEC_KEY2_MD5,
|
||||
SSEC_KEY,
|
||||
SSEC_KEY_MD5,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn test_read_encrypted(
|
||||
ctx: &Context,
|
||||
bucket: &str,
|
||||
obj_key: &str,
|
||||
expected_data: &[u8],
|
||||
enc_key: &str,
|
||||
enc_key_md5: &str,
|
||||
wrong_enc_key: &str,
|
||||
wrong_enc_key_md5: &str,
|
||||
) {
|
||||
// Test read encrypted without key
|
||||
let o = ctx
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(bucket)
|
||||
.key(obj_key)
|
||||
.send()
|
||||
.await;
|
||||
assert!(
|
||||
o.is_err(),
|
||||
"encrypted file could be read without encryption key"
|
||||
);
|
||||
|
||||
// Test read encrypted with wrong key
|
||||
let o = ctx
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(bucket)
|
||||
.key(obj_key)
|
||||
.sse_customer_key(wrong_enc_key)
|
||||
.sse_customer_key_md5(wrong_enc_key_md5)
|
||||
.send()
|
||||
.await;
|
||||
assert!(
|
||||
o.is_err(),
|
||||
"encrypted file could be read with incorrect encryption key"
|
||||
);
|
||||
|
||||
// Test read encrypted with correct key
|
||||
let o = ctx
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(bucket)
|
||||
.key(obj_key)
|
||||
.sse_customer_algorithm("AES256")
|
||||
.sse_customer_key(enc_key)
|
||||
.sse_customer_key_md5(enc_key_md5)
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_bytes_eq!(o.body, expected_data);
|
||||
assert_eq!(o.sse_customer_algorithm, Some("AES256".into()));
|
||||
assert_eq!(o.sse_customer_key_md5, Some(enc_key_md5.to_string()));
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "garage_model"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
authors = ["Alex Auvolat <alex@adnab.me>"]
|
||||
edition = "2018"
|
||||
license = "AGPL-3.0"
|
||||
|
@ -27,6 +27,7 @@ blake2.workspace = true
|
|||
chrono.workspace = true
|
||||
err-derive.workspace = true
|
||||
hex.workspace = true
|
||||
http.workspace = true
|
||||
base64.workspace = true
|
||||
parse_duration.workspace = true
|
||||
tracing.workspace = true
|
||||
|
@ -42,8 +43,7 @@ tokio.workspace = true
|
|||
opentelemetry.workspace = true
|
||||
|
||||
[features]
|
||||
default = [ "sled", "lmdb", "sqlite" ]
|
||||
default = [ "lmdb", "sqlite" ]
|
||||
k2v = [ "garage_util/k2v" ]
|
||||
lmdb = [ "garage_db/lmdb" ]
|
||||
sled = [ "garage_db/sled" ]
|
||||
sqlite = [ "garage_db/sqlite" ]
|
||||
|
|
|
@ -10,7 +10,7 @@ use garage_util::config::*;
|
|||
use garage_util::error::*;
|
||||
use garage_util::persister::PersisterShared;
|
||||
|
||||
use garage_rpc::replication_mode::ReplicationMode;
|
||||
use garage_rpc::replication_mode::*;
|
||||
use garage_rpc::system::System;
|
||||
|
||||
use garage_block::manager::*;
|
||||
|
@ -40,8 +40,8 @@ pub struct Garage {
|
|||
/// The set of background variables that can be viewed/modified at runtime
|
||||
pub bg_vars: vars::BgVars,
|
||||
|
||||
/// The replication mode of this cluster
|
||||
pub replication_mode: ReplicationMode,
|
||||
/// The replication factor of this cluster
|
||||
pub replication_factor: ReplicationFactor,
|
||||
|
||||
/// The local database
|
||||
pub db: db::Db,
|
||||
|
@ -118,9 +118,6 @@ impl Garage {
|
|||
.ok_or_message("Invalid `db_engine` value in configuration file")?;
|
||||
let mut db_path = config.metadata_dir.clone();
|
||||
match db_engine {
|
||||
db::Engine::Sled => {
|
||||
db_path.push("db");
|
||||
}
|
||||
db::Engine::Sqlite => {
|
||||
db_path.push("db.sqlite");
|
||||
}
|
||||
|
@ -134,8 +131,6 @@ impl Garage {
|
|||
v if v == usize::default() => None,
|
||||
v => Some(v),
|
||||
},
|
||||
sled_cache_capacity: config.sled_cache_capacity,
|
||||
sled_flush_every_ms: config.sled_flush_every_ms,
|
||||
};
|
||||
let db = db::open_db(&db_path, db_engine, &db_opt)
|
||||
.ok_or_message("Unable to open metadata db")?;
|
||||
|
@ -148,32 +143,30 @@ impl Garage {
|
|||
.and_then(|x| NetworkKey::from_slice(&x))
|
||||
.ok_or_message("Invalid RPC secret key")?;
|
||||
|
||||
let replication_mode = ReplicationMode::parse(&config.replication_mode)
|
||||
.ok_or_message("Invalid replication_mode in config file.")?;
|
||||
let (replication_factor, consistency_mode) = parse_replication_mode(&config)?;
|
||||
|
||||
info!("Initialize background variable system...");
|
||||
let mut bg_vars = vars::BgVars::new();
|
||||
|
||||
info!("Initialize membership management system...");
|
||||
let system = System::new(network_key, replication_mode, &config)?;
|
||||
let system = System::new(network_key, replication_factor, consistency_mode, &config)?;
|
||||
|
||||
let data_rep_param = TableShardedReplication {
|
||||
system: system.clone(),
|
||||
replication_factor: replication_mode.replication_factor(),
|
||||
write_quorum: replication_mode.write_quorum(),
|
||||
replication_factor: replication_factor.into(),
|
||||
write_quorum: replication_factor.write_quorum(consistency_mode),
|
||||
read_quorum: 1,
|
||||
};
|
||||
|
||||
let meta_rep_param = TableShardedReplication {
|
||||
system: system.clone(),
|
||||
replication_factor: replication_mode.replication_factor(),
|
||||
write_quorum: replication_mode.write_quorum(),
|
||||
read_quorum: replication_mode.read_quorum(),
|
||||
replication_factor: replication_factor.into(),
|
||||
write_quorum: replication_factor.write_quorum(consistency_mode),
|
||||
read_quorum: replication_factor.read_quorum(consistency_mode),
|
||||
};
|
||||
|
||||
let control_rep_param = TableFullReplication {
|
||||
system: system.clone(),
|
||||
max_faults: replication_mode.control_write_max_faults(),
|
||||
};
|
||||
|
||||
info!("Initialize block manager...");
|
||||
|
@ -254,11 +247,19 @@ impl Garage {
|
|||
#[cfg(feature = "k2v")]
|
||||
let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param);
|
||||
|
||||
// ---- setup block refcount recalculation ----
|
||||
// this function can be used to fix inconsistencies in the RC table
|
||||
block_manager.set_recalc_rc(vec![
|
||||
block_ref_recount_fn(&block_ref_table),
|
||||
// other functions could be added here if we had other tables
|
||||
// that hold references to data blocks
|
||||
]);
|
||||
|
||||
// -- done --
|
||||
Ok(Arc::new(Self {
|
||||
config,
|
||||
bg_vars,
|
||||
replication_mode,
|
||||
replication_factor,
|
||||
db,
|
||||
system,
|
||||
block_manager,
|
||||
|
|
|
@ -155,10 +155,12 @@ impl<'a> BucketHelper<'a> {
|
|||
|
||||
#[cfg(feature = "k2v")]
|
||||
{
|
||||
use garage_rpc::ring::Ring;
|
||||
use std::sync::Arc;
|
||||
|
||||
let ring: Arc<Ring> = self.0.system.ring.borrow().clone();
|
||||
let node_id_vec = self
|
||||
.0
|
||||
.system
|
||||
.cluster_layout()
|
||||
.all_nongateway_nodes()
|
||||
.to_vec();
|
||||
let k2vindexes = self
|
||||
.0
|
||||
.k2v
|
||||
|
@ -167,7 +169,7 @@ impl<'a> BucketHelper<'a> {
|
|||
.get_range(
|
||||
&bucket_id,
|
||||
None,
|
||||
Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())),
|
||||
Some((DeletedFilter::NotDeleted, node_id_vec)),
|
||||
10,
|
||||
EnumerationOrder::Forward,
|
||||
)
|
||||
|
|
|
@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
|
|||
|
||||
use garage_db as db;
|
||||
|
||||
use garage_rpc::ring::Ring;
|
||||
use garage_rpc::layout::LayoutHelper;
|
||||
use garage_rpc::system::System;
|
||||
use garage_util::background::BackgroundRunner;
|
||||
use garage_util::data::*;
|
||||
|
@ -83,9 +83,9 @@ impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> {
|
|||
}
|
||||
|
||||
impl<T: CountedItem> CounterEntry<T> {
|
||||
pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> {
|
||||
let nodes = &ring.layout.node_id_vec[..];
|
||||
self.filtered_values_with_nodes(nodes)
|
||||
pub fn filtered_values(&self, layout: &LayoutHelper) -> HashMap<String, i64> {
|
||||
let nodes = layout.all_nongateway_nodes();
|
||||
self.filtered_values_with_nodes(&nodes)
|
||||
}
|
||||
|
||||
pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> {
|
||||
|
|
|
@ -127,23 +127,21 @@ impl K2VRpcHandler {
|
|||
.item_table
|
||||
.data
|
||||
.replication
|
||||
.write_nodes(&partition.hash());
|
||||
.storage_nodes(&partition.hash());
|
||||
who.sort();
|
||||
|
||||
self.system
|
||||
.rpc
|
||||
.rpc_helper()
|
||||
.try_call_many(
|
||||
&self.endpoint,
|
||||
&who[..],
|
||||
&who,
|
||||
K2VRpc::InsertItem(InsertedItem {
|
||||
partition,
|
||||
sort_key,
|
||||
causal_context,
|
||||
value,
|
||||
}),
|
||||
RequestStrategy::with_priority(PRIO_NORMAL)
|
||||
.with_quorum(1)
|
||||
.interrupt_after_quorum(true),
|
||||
RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1),
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
@ -168,7 +166,7 @@ impl K2VRpcHandler {
|
|||
.item_table
|
||||
.data
|
||||
.replication
|
||||
.write_nodes(&partition.hash());
|
||||
.storage_nodes(&partition.hash());
|
||||
who.sort();
|
||||
|
||||
call_list.entry(who).or_default().push(InsertedItem {
|
||||
|
@ -187,14 +185,12 @@ impl K2VRpcHandler {
|
|||
let call_futures = call_list.into_iter().map(|(nodes, items)| async move {
|
||||
let resp = self
|
||||
.system
|
||||
.rpc
|
||||
.rpc_helper()
|
||||
.try_call_many(
|
||||
&self.endpoint,
|
||||
&nodes[..],
|
||||
K2VRpc::InsertManyItems(items),
|
||||
RequestStrategy::with_priority(PRIO_NORMAL)
|
||||
.with_quorum(1)
|
||||
.interrupt_after_quorum(true),
|
||||
RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1),
|
||||
)
|
||||
.await?;
|
||||
Ok::<_, Error>((nodes, resp))
|
||||
|
@ -227,11 +223,11 @@ impl K2VRpcHandler {
|
|||
.item_table
|
||||
.data
|
||||
.replication
|
||||
.write_nodes(&poll_key.partition.hash());
|
||||
.storage_nodes(&poll_key.partition.hash());
|
||||
|
||||
let rpc = self.system.rpc.try_call_many(
|
||||
let rpc = self.system.rpc_helper().try_call_many(
|
||||
&self.endpoint,
|
||||
&nodes[..],
|
||||
&nodes,
|
||||
K2VRpc::PollItem {
|
||||
key: poll_key,
|
||||
causal_context,
|
||||
|
@ -239,9 +235,10 @@ impl K2VRpcHandler {
|
|||
},
|
||||
RequestStrategy::with_priority(PRIO_NORMAL)
|
||||
.with_quorum(self.item_table.data.replication.read_quorum())
|
||||
.send_all_at_once(true)
|
||||
.without_timeout(),
|
||||
);
|
||||
let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
|
||||
let timeout_duration = Duration::from_millis(timeout_msec);
|
||||
let resps = select! {
|
||||
r = rpc => r?,
|
||||
_ = tokio::time::sleep(timeout_duration) => return Ok(None),
|
||||
|
@ -287,7 +284,7 @@ impl K2VRpcHandler {
|
|||
.item_table
|
||||
.data
|
||||
.replication
|
||||
.write_nodes(&range.partition.hash());
|
||||
.storage_nodes(&range.partition.hash());
|
||||
let quorum = self.item_table.data.replication.read_quorum();
|
||||
let msg = K2VRpc::PollRange {
|
||||
range,
|
||||
|
@ -302,7 +299,7 @@ impl K2VRpcHandler {
|
|||
.iter()
|
||||
.map(|node| {
|
||||
self.system
|
||||
.rpc
|
||||
.rpc_helper()
|
||||
.call(&self.endpoint, *node, msg.clone(), rs.clone())
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
@ -320,8 +317,7 @@ impl K2VRpcHandler {
|
|||
// kind: all items produced by that node until time ts have been returned, so we can
|
||||
// bump the entry in the global vector clock and possibly remove some item-specific
|
||||
// vector clocks)
|
||||
let mut deadline =
|
||||
Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
|
||||
let mut deadline = Instant::now() + Duration::from_millis(timeout_msec);
|
||||
let mut resps = vec![];
|
||||
let mut errors = vec![];
|
||||
loop {
|
||||
|
@ -343,7 +339,7 @@ impl K2VRpcHandler {
|
|||
}
|
||||
if errors.len() > nodes.len() - quorum {
|
||||
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
|
||||
return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into());
|
||||
return Err(Error::Quorum(quorum, None, resps.len(), nodes.len(), errors).into());
|
||||
}
|
||||
|
||||
// Take all returned items into account to produce the response.
|
||||
|
|
|
@ -7,48 +7,7 @@ use garage_table::{DeletedFilter, EmptyKey, Entry, TableSchema};
|
|||
|
||||
use crate::permission::BucketKeyPerm;
|
||||
|
||||
pub(crate) mod v05 {
|
||||
use garage_util::crdt;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// An api key
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Key {
|
||||
/// The id of the key (immutable), used as partition key
|
||||
pub key_id: String,
|
||||
|
||||
/// The secret_key associated
|
||||
pub secret_key: String,
|
||||
|
||||
/// Name for the key
|
||||
pub name: crdt::Lww<String>,
|
||||
|
||||
/// Is the key deleted
|
||||
pub deleted: crdt::Bool,
|
||||
|
||||
/// Buckets in which the key is authorized. Empty if `Key` is deleted
|
||||
// CRDT interaction: deleted implies authorized_buckets is empty
|
||||
pub authorized_buckets: crdt::LwwMap<String, PermissionSet>,
|
||||
}
|
||||
|
||||
/// Permission given to a key in a bucket
|
||||
#[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct PermissionSet {
|
||||
/// The key can be used to read the bucket
|
||||
pub allow_read: bool,
|
||||
/// The key can be used to write in the bucket
|
||||
pub allow_write: bool,
|
||||
}
|
||||
|
||||
impl crdt::AutoCrdt for PermissionSet {
|
||||
const WARN_IF_DIFFERENT: bool = true;
|
||||
}
|
||||
|
||||
impl garage_util::migrate::InitialFormat for Key {}
|
||||
}
|
||||
|
||||
mod v08 {
|
||||
use super::v05;
|
||||
use crate::permission::BucketKeyPerm;
|
||||
use garage_util::crdt;
|
||||
use garage_util::data::Uuid;
|
||||
|
@ -86,32 +45,7 @@ mod v08 {
|
|||
pub local_aliases: crdt::LwwMap<String, Option<Uuid>>,
|
||||
}
|
||||
|
||||
impl garage_util::migrate::Migrate for Key {
|
||||
type Previous = v05::Key;
|
||||
|
||||
fn migrate(old_k: v05::Key) -> Key {
|
||||
let name = crdt::Lww::raw(old_k.name.timestamp(), old_k.name.get().clone());
|
||||
|
||||
let state = if old_k.deleted.get() {
|
||||
crdt::Deletable::Deleted
|
||||
} else {
|
||||
// Authorized buckets is ignored here,
|
||||
// migration is performed in specific migration code in
|
||||
// garage/migrate.rs
|
||||
crdt::Deletable::Present(KeyParams {
|
||||
secret_key: old_k.secret_key,
|
||||
name,
|
||||
allow_create_bucket: crdt::Lww::new(false),
|
||||
authorized_buckets: crdt::Map::new(),
|
||||
local_aliases: crdt::LwwMap::new(),
|
||||
})
|
||||
};
|
||||
Key {
|
||||
key_id: old_k.key_id,
|
||||
state,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl garage_util::migrate::InitialFormat for Key {}
|
||||
}
|
||||
|
||||
pub use v08::*;
|
||||
|
|
|
@ -1,9 +1,6 @@
|
|||
#[macro_use]
|
||||
extern crate tracing;
|
||||
|
||||
// For migration from previous versions
|
||||
pub(crate) mod prev;
|
||||
|
||||
pub mod permission;
|
||||
|
||||
pub mod index_counter;
|
||||
|
@ -18,5 +15,4 @@ pub mod s3;
|
|||
|
||||
pub mod garage;
|
||||
pub mod helper;
|
||||
pub mod migrate;
|
||||
pub mod snapshot;
|
||||
|
|
|
@ -1,108 +0,0 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use garage_util::crdt::*;
|
||||
use garage_util::data::*;
|
||||
use garage_util::encode::nonversioned_decode;
|
||||
use garage_util::error::Error as GarageError;
|
||||
use garage_util::time::*;
|
||||
|
||||
use crate::prev::v051::bucket_table as old_bucket;
|
||||
|
||||
use crate::bucket_alias_table::*;
|
||||
use crate::bucket_table::*;
|
||||
use crate::garage::Garage;
|
||||
use crate::helper::error::*;
|
||||
use crate::permission::*;
|
||||
|
||||
pub struct Migrate {
|
||||
pub garage: Arc<Garage>,
|
||||
}
|
||||
|
||||
impl Migrate {
|
||||
pub async fn migrate_buckets050(&self) -> Result<(), Error> {
|
||||
let tree = self
|
||||
.garage
|
||||
.db
|
||||
.open_tree("bucket:table")
|
||||
.map_err(GarageError::from)?;
|
||||
|
||||
let mut old_buckets = vec![];
|
||||
for res in tree.iter().map_err(GarageError::from)? {
|
||||
let (_k, v) = res.map_err(GarageError::from)?;
|
||||
let bucket =
|
||||
nonversioned_decode::<old_bucket::Bucket>(&v[..]).map_err(GarageError::from)?;
|
||||
old_buckets.push(bucket);
|
||||
}
|
||||
|
||||
for bucket in old_buckets {
|
||||
if let old_bucket::BucketState::Present(p) = bucket.state.get() {
|
||||
self.migrate_buckets050_do_bucket(&bucket, p).await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn migrate_buckets050_do_bucket(
|
||||
&self,
|
||||
old_bucket: &old_bucket::Bucket,
|
||||
old_bucket_p: &old_bucket::BucketParams,
|
||||
) -> Result<(), Error> {
|
||||
let bucket_id = blake2sum(old_bucket.name.as_bytes());
|
||||
|
||||
let new_name = if is_valid_bucket_name(&old_bucket.name) {
|
||||
old_bucket.name.clone()
|
||||
} else {
|
||||
// if old bucket name was not valid, replace it by
|
||||
// a hex-encoded name derived from its identifier
|
||||
hex::encode(&bucket_id.as_slice()[..16])
|
||||
};
|
||||
|
||||
let website = if *old_bucket_p.website.get() {
|
||||
Some(WebsiteConfig {
|
||||
index_document: "index.html".into(),
|
||||
error_document: None,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let helper = self.garage.locked_helper().await;
|
||||
|
||||
self.garage
|
||||
.bucket_table
|
||||
.insert(&Bucket {
|
||||
id: bucket_id,
|
||||
state: Deletable::Present(BucketParams {
|
||||
creation_date: now_msec(),
|
||||
authorized_keys: Map::new(),
|
||||
aliases: LwwMap::new(),
|
||||
local_aliases: LwwMap::new(),
|
||||
website_config: Lww::new(website),
|
||||
cors_config: Lww::new(None),
|
||||
lifecycle_config: Lww::new(None),
|
||||
quotas: Lww::new(Default::default()),
|
||||
}),
|
||||
})
|
||||
.await?;
|
||||
|
||||
helper.set_global_bucket_alias(bucket_id, &new_name).await?;
|
||||
|
||||
for (k, ts, perm) in old_bucket_p.authorized_keys.items().iter() {
|
||||
helper
|
||||
.set_bucket_key_permissions(
|
||||
bucket_id,
|
||||
k,
|
||||
BucketKeyPerm {
|
||||
timestamp: *ts,
|
||||
allow_read: perm.allow_read,
|
||||
allow_write: perm.allow_write,
|
||||
allow_owner: false,
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
|
@ -1 +0,0 @@
|
|||
pub(crate) mod v051;
|
|
@ -1,63 +0,0 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use garage_table::crdt::Crdt;
|
||||
use garage_table::*;
|
||||
|
||||
use crate::key_table::v05::PermissionSet;
|
||||
|
||||
/// A bucket is a collection of objects
|
||||
///
|
||||
/// Its parameters are not directly accessible as:
|
||||
/// - It must be possible to merge paramaters, hence the use of a LWW CRDT.
|
||||
/// - A bucket has 2 states, Present or Deleted and parameters make sense only if present.
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Bucket {
|
||||
/// Name of the bucket
|
||||
pub name: String,
|
||||
/// State, and configuration if not deleted, of the bucket
|
||||
pub state: crdt::Lww<BucketState>,
|
||||
}
|
||||
|
||||
/// State of a bucket
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub enum BucketState {
|
||||
/// The bucket is deleted
|
||||
Deleted,
|
||||
/// The bucket exists
|
||||
Present(BucketParams),
|
||||
}
|
||||
|
||||
impl Crdt for BucketState {
|
||||
fn merge(&mut self, o: &Self) {
|
||||
match o {
|
||||
BucketState::Deleted => *self = BucketState::Deleted,
|
||||
BucketState::Present(other_params) => {
|
||||
if let BucketState::Present(params) = self {
|
||||
params.merge(other_params);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a bucket
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct BucketParams {
|
||||
/// Map of key with access to the bucket, and what kind of access they give
|
||||
pub authorized_keys: crdt::LwwMap<String, PermissionSet>,
|
||||
/// Is the bucket served as http
|
||||
pub website: crdt::Lww<bool>,
|
||||
}
|
||||
|
||||
impl Crdt for BucketParams {
|
||||
fn merge(&mut self, o: &Self) {
|
||||
self.authorized_keys.merge(&o.authorized_keys);
|
||||
self.website.merge(&o.website);
|
||||
}
|
||||
}
|
||||
|
||||
impl Crdt for Bucket {
|
||||
fn merge(&mut self, other: &Self) {
|
||||
self.state.merge(&other.state);
|
||||
}
|
||||
}
|
|
@ -1 +0,0 @@
|
|||
pub(crate) mod bucket_table;
|
|
@ -3,8 +3,12 @@ use std::sync::Arc;
|
|||
use garage_db as db;
|
||||
|
||||
use garage_util::data::*;
|
||||
use garage_util::error::*;
|
||||
use garage_util::migrate::Migrate;
|
||||
|
||||
use garage_block::CalculateRefcount;
|
||||
use garage_table::crdt::Crdt;
|
||||
use garage_table::replication::TableShardedReplication;
|
||||
use garage_table::*;
|
||||
|
||||
use garage_block::manager::*;
|
||||
|
@ -84,3 +88,38 @@ impl TableSchema for BlockRefTable {
|
|||
filter.apply(entry.deleted.get())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn block_ref_recount_fn(
|
||||
block_ref_table: &Arc<Table<BlockRefTable, TableShardedReplication>>,
|
||||
) -> CalculateRefcount {
|
||||
let table = Arc::downgrade(block_ref_table);
|
||||
Box::new(move |tx: &db::Transaction, block: &Hash| {
|
||||
let table = table
|
||||
.upgrade()
|
||||
.ok_or_message("cannot upgrade weak ptr to block_ref_table")
|
||||
.map_err(db::TxError::Abort)?;
|
||||
Ok(calculate_refcount(&table, tx, block)?)
|
||||
})
|
||||
}
|
||||
|
||||
fn calculate_refcount(
|
||||
block_ref_table: &Table<BlockRefTable, TableShardedReplication>,
|
||||
tx: &db::Transaction,
|
||||
block: &Hash,
|
||||
) -> db::TxResult<usize, Error> {
|
||||
let mut result = 0;
|
||||
for entry in tx.range(&block_ref_table.data.store, block.as_slice()..)? {
|
||||
let (key, value) = entry?;
|
||||
if &key[..32] != block.as_slice() {
|
||||
break;
|
||||
}
|
||||
let value = BlockRef::decode(&value)
|
||||
.ok_or_message("could not decode block_ref")
|
||||
.map_err(db::TxError::Abort)?;
|
||||
assert_eq!(value.block, *block);
|
||||
if !value.deleted.get() {
|
||||
result += 1;
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
|
|
@ -121,13 +121,7 @@ impl Worker for LifecycleWorker {
|
|||
mpu_aborted,
|
||||
..
|
||||
} => {
|
||||
let n_objects = self
|
||||
.garage
|
||||
.object_table
|
||||
.data
|
||||
.store
|
||||
.fast_len()
|
||||
.unwrap_or(None);
|
||||
let n_objects = self.garage.object_table.data.store.len().ok();
|
||||
let progress = match n_objects {
|
||||
None => "...".to_string(),
|
||||
Some(total) => format!(
|
||||
|
|
|
@ -17,6 +17,7 @@ pub const PARTS: &str = "parts";
|
|||
pub const BYTES: &str = "bytes";
|
||||
|
||||
mod v09 {
|
||||
use crate::s3::object_table::ChecksumValue;
|
||||
use garage_util::crdt;
|
||||
use garage_util::data::Uuid;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
@ -61,6 +62,9 @@ mod v09 {
|
|||
pub version: Uuid,
|
||||
/// ETag of the content of this part (known only once done uploading)
|
||||
pub etag: Option<String>,
|
||||
/// Checksum requested by x-amz-checksum-algorithm
|
||||
#[serde(default)]
|
||||
pub checksum: Option<ChecksumValue>,
|
||||
/// Size of this part (known only once done uploading)
|
||||
pub size: Option<u64>,
|
||||
}
|
||||
|
@ -155,6 +159,11 @@ impl Crdt for MpuPart {
|
|||
(Some(x), Some(y)) if x < y => other.size,
|
||||
(x, _) => x,
|
||||
};
|
||||
self.checksum = match (self.checksum.take(), &other.checksum) {
|
||||
(None, Some(_)) => other.checksum.clone(),
|
||||
(Some(x), Some(y)) if x < *y => other.checksum.clone(),
|
||||
(x, _) => x,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ pub const OBJECTS: &str = "objects";
|
|||
pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads";
|
||||
pub const BYTES: &str = "bytes";
|
||||
|
||||
mod v05 {
|
||||
mod v08 {
|
||||
use garage_util::data::{Hash, Uuid};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
|
@ -26,7 +26,7 @@ mod v05 {
|
|||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Object {
|
||||
/// The bucket in which the object is stored, used as partition key
|
||||
pub bucket: String,
|
||||
pub bucket_id: Uuid,
|
||||
|
||||
/// The key at which the object is stored in its bucket, used as sorting key
|
||||
pub key: String,
|
||||
|
@ -92,45 +92,6 @@ mod v05 {
|
|||
impl garage_util::migrate::InitialFormat for Object {}
|
||||
}
|
||||
|
||||
mod v08 {
|
||||
use garage_util::data::Uuid;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::v05;
|
||||
|
||||
pub use v05::{
|
||||
ObjectVersion, ObjectVersionData, ObjectVersionHeaders, ObjectVersionMeta,
|
||||
ObjectVersionState,
|
||||
};
|
||||
|
||||
/// An object
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Object {
|
||||
/// The bucket in which the object is stored, used as partition key
|
||||
pub bucket_id: Uuid,
|
||||
|
||||
/// The key at which the object is stored in its bucket, used as sorting key
|
||||
pub key: String,
|
||||
|
||||
/// The list of currenty stored versions of the object
|
||||
pub(super) versions: Vec<ObjectVersion>,
|
||||
}
|
||||
|
||||
impl garage_util::migrate::Migrate for Object {
|
||||
type Previous = v05::Object;
|
||||
|
||||
fn migrate(old: v05::Object) -> Object {
|
||||
use garage_util::data::blake2sum;
|
||||
|
||||
Object {
|
||||
bucket_id: blake2sum(old.bucket.as_bytes()),
|
||||
key: old.key,
|
||||
versions: old.versions,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod v09 {
|
||||
use garage_util::data::Uuid;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
@ -210,7 +171,207 @@ mod v09 {
|
|||
}
|
||||
}
|
||||
|
||||
pub use v09::*;
|
||||
mod v010 {
|
||||
use garage_util::data::{Hash, Uuid};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::v09;
|
||||
|
||||
/// An object
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Object {
|
||||
/// The bucket in which the object is stored, used as partition key
|
||||
pub bucket_id: Uuid,
|
||||
|
||||
/// The key at which the object is stored in its bucket, used as sorting key
|
||||
pub key: String,
|
||||
|
||||
/// The list of currenty stored versions of the object
|
||||
pub(super) versions: Vec<ObjectVersion>,
|
||||
}
|
||||
|
||||
/// Informations about a version of an object
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ObjectVersion {
|
||||
/// Id of the version
|
||||
pub uuid: Uuid,
|
||||
/// Timestamp of when the object was created
|
||||
pub timestamp: u64,
|
||||
/// State of the version
|
||||
pub state: ObjectVersionState,
|
||||
}
|
||||
|
||||
/// State of an object version
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub enum ObjectVersionState {
|
||||
/// The version is being received
|
||||
Uploading {
|
||||
/// Indicates whether this is a multipart upload
|
||||
multipart: bool,
|
||||
/// Checksum algorithm to use
|
||||
checksum_algorithm: Option<ChecksumAlgorithm>,
|
||||
/// Encryption params + headers to be included in the final object
|
||||
encryption: ObjectVersionEncryption,
|
||||
},
|
||||
/// The version is fully received
|
||||
Complete(ObjectVersionData),
|
||||
/// The version uploaded containded errors or the upload was explicitly aborted
|
||||
Aborted,
|
||||
}
|
||||
|
||||
/// Data stored in object version
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
|
||||
pub enum ObjectVersionData {
|
||||
/// The object was deleted, this Version is a tombstone to mark it as such
|
||||
DeleteMarker,
|
||||
/// The object is short, it's stored inlined.
|
||||
/// It is never compressed. For encrypted objects, it is encrypted using
|
||||
/// AES256-GCM, like the encrypted headers.
|
||||
Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
|
||||
/// The object is not short, Hash of first block is stored here, next segments hashes are
|
||||
/// stored in the version table
|
||||
FirstBlock(ObjectVersionMeta, Hash),
|
||||
}
|
||||
|
||||
/// Metadata about the object version
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ObjectVersionMeta {
|
||||
/// Size of the object. If object is encrypted/compressed,
|
||||
/// this is always the size of the unencrypted/uncompressed data
|
||||
pub size: u64,
|
||||
/// etag of the object
|
||||
pub etag: String,
|
||||
/// Encryption params + headers (encrypted or plaintext)
|
||||
pub encryption: ObjectVersionEncryption,
|
||||
}
|
||||
|
||||
/// Encryption information + metadata
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
|
||||
pub enum ObjectVersionEncryption {
|
||||
SseC {
|
||||
/// Encrypted serialized ObjectVersionInner struct.
|
||||
/// This is never compressed, just encrypted using AES256-GCM.
|
||||
#[serde(with = "serde_bytes")]
|
||||
inner: Vec<u8>,
|
||||
/// Whether data blocks are compressed in addition to being encrypted
|
||||
/// (compression happens before encryption, whereas for non-encrypted
|
||||
/// objects, compression is handled at the level of the block manager)
|
||||
compressed: bool,
|
||||
},
|
||||
Plaintext {
|
||||
/// Plain-text headers
|
||||
inner: ObjectVersionMetaInner,
|
||||
},
|
||||
}
|
||||
|
||||
/// Vector of headers, as tuples of the format (header name, header value)
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ObjectVersionMetaInner {
|
||||
pub headers: HeaderList,
|
||||
pub checksum: Option<ChecksumValue>,
|
||||
}
|
||||
|
||||
pub type HeaderList = Vec<(String, String)>;
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub enum ChecksumAlgorithm {
|
||||
Crc32,
|
||||
Crc32c,
|
||||
Sha1,
|
||||
Sha256,
|
||||
}
|
||||
|
||||
/// Checksum value for x-amz-checksum-algorithm
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub enum ChecksumValue {
|
||||
Crc32(#[serde(with = "serde_bytes")] [u8; 4]),
|
||||
Crc32c(#[serde(with = "serde_bytes")] [u8; 4]),
|
||||
Sha1(#[serde(with = "serde_bytes")] [u8; 20]),
|
||||
Sha256(#[serde(with = "serde_bytes")] [u8; 32]),
|
||||
}
|
||||
|
||||
impl garage_util::migrate::Migrate for Object {
|
||||
const VERSION_MARKER: &'static [u8] = b"G010s3ob";
|
||||
|
||||
type Previous = v09::Object;
|
||||
|
||||
fn migrate(old: v09::Object) -> Object {
|
||||
Object {
|
||||
bucket_id: old.bucket_id,
|
||||
key: old.key,
|
||||
versions: old.versions.into_iter().map(migrate_version).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn migrate_version(old: v09::ObjectVersion) -> ObjectVersion {
|
||||
ObjectVersion {
|
||||
uuid: old.uuid,
|
||||
timestamp: old.timestamp,
|
||||
state: match old.state {
|
||||
v09::ObjectVersionState::Uploading { multipart, headers } => {
|
||||
ObjectVersionState::Uploading {
|
||||
multipart,
|
||||
checksum_algorithm: None,
|
||||
encryption: migrate_headers(headers),
|
||||
}
|
||||
}
|
||||
v09::ObjectVersionState::Complete(d) => {
|
||||
ObjectVersionState::Complete(migrate_data(d))
|
||||
}
|
||||
v09::ObjectVersionState::Aborted => ObjectVersionState::Aborted,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn migrate_data(old: v09::ObjectVersionData) -> ObjectVersionData {
|
||||
match old {
|
||||
v09::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker,
|
||||
v09::ObjectVersionData::Inline(meta, data) => {
|
||||
ObjectVersionData::Inline(migrate_meta(meta), data)
|
||||
}
|
||||
v09::ObjectVersionData::FirstBlock(meta, fb) => {
|
||||
ObjectVersionData::FirstBlock(migrate_meta(meta), fb)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn migrate_meta(old: v09::ObjectVersionMeta) -> ObjectVersionMeta {
|
||||
ObjectVersionMeta {
|
||||
size: old.size,
|
||||
etag: old.etag,
|
||||
encryption: migrate_headers(old.headers),
|
||||
}
|
||||
}
|
||||
|
||||
fn migrate_headers(old: v09::ObjectVersionHeaders) -> ObjectVersionEncryption {
|
||||
use http::header::CONTENT_TYPE;
|
||||
|
||||
let mut new_headers = Vec::with_capacity(old.other.len() + 1);
|
||||
if old.content_type != "blob" {
|
||||
new_headers.push((CONTENT_TYPE.as_str().to_string(), old.content_type));
|
||||
}
|
||||
for (name, value) in old.other.into_iter() {
|
||||
new_headers.push((name, value));
|
||||
}
|
||||
|
||||
ObjectVersionEncryption::Plaintext {
|
||||
inner: ObjectVersionMetaInner {
|
||||
headers: new_headers,
|
||||
checksum: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Since ObjectVersionMetaInner can now be serialized independently, for the
|
||||
// purpose of being encrypted, we need it to support migrations on its own
|
||||
// as well.
|
||||
impl garage_util::migrate::InitialFormat for ObjectVersionMetaInner {
|
||||
const VERSION_MARKER: &'static [u8] = b"G010s3om";
|
||||
}
|
||||
}
|
||||
|
||||
pub use v010::*;
|
||||
|
||||
impl Object {
|
||||
/// Initialize an Object struct from parts
|
||||
|
@ -321,6 +482,17 @@ impl Entry<Uuid, String> for Object {
|
|||
}
|
||||
}
|
||||
|
||||
impl ChecksumValue {
|
||||
pub fn algorithm(&self) -> ChecksumAlgorithm {
|
||||
match self {
|
||||
ChecksumValue::Crc32(_) => ChecksumAlgorithm::Crc32,
|
||||
ChecksumValue::Crc32c(_) => ChecksumAlgorithm::Crc32c,
|
||||
ChecksumValue::Sha1(_) => ChecksumAlgorithm::Sha1,
|
||||
ChecksumValue::Sha256(_) => ChecksumAlgorithm::Sha256,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Crdt for Object {
|
||||
fn merge(&mut self, other: &Self) {
|
||||
// Merge versions from other into here
|
||||
|
|
|
@ -11,64 +11,11 @@ use garage_table::*;
|
|||
|
||||
use crate::s3::block_ref_table::*;
|
||||
|
||||
mod v05 {
|
||||
mod v08 {
|
||||
use garage_util::crdt;
|
||||
use garage_util::data::{Hash, Uuid};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A version of an object
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Version {
|
||||
/// UUID of the version, used as partition key
|
||||
pub uuid: Uuid,
|
||||
|
||||
// Actual data: the blocks for this version
|
||||
// In the case of a multipart upload, also store the etags
|
||||
// of individual parts and check them when doing CompleteMultipartUpload
|
||||
/// Is this version deleted
|
||||
pub deleted: crdt::Bool,
|
||||
/// list of blocks of data composing the version
|
||||
pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
|
||||
/// Etag of each part in case of a multipart upload, empty otherwise
|
||||
pub parts_etags: crdt::Map<u64, String>,
|
||||
|
||||
// Back link to bucket+key so that we can figure if
|
||||
// this was deleted later on
|
||||
/// Bucket in which the related object is stored
|
||||
pub bucket: String,
|
||||
/// Key in which the related object is stored
|
||||
pub key: String,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub struct VersionBlockKey {
|
||||
/// Number of the part
|
||||
pub part_number: u64,
|
||||
/// Offset of this sub-segment in its part
|
||||
pub offset: u64,
|
||||
}
|
||||
|
||||
/// Informations about a single block
|
||||
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub struct VersionBlock {
|
||||
/// Blake2 sum of the block
|
||||
pub hash: Hash,
|
||||
/// Size of the block
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
impl garage_util::migrate::InitialFormat for Version {}
|
||||
}
|
||||
|
||||
mod v08 {
|
||||
use garage_util::crdt;
|
||||
use garage_util::data::Uuid;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::v05;
|
||||
|
||||
pub use v05::{VersionBlock, VersionBlockKey};
|
||||
|
||||
/// A version of an object
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Version {
|
||||
|
@ -93,22 +40,25 @@ mod v08 {
|
|||
pub key: String,
|
||||
}
|
||||
|
||||
impl garage_util::migrate::Migrate for Version {
|
||||
type Previous = v05::Version;
|
||||
#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub struct VersionBlockKey {
|
||||
/// Number of the part
|
||||
pub part_number: u64,
|
||||
/// Offset of this sub-segment in its part as sent by the client
|
||||
/// (before any kind of compression or encryption)
|
||||
pub offset: u64,
|
||||
}
|
||||
|
||||
fn migrate(old: v05::Version) -> Version {
|
||||
use garage_util::data::blake2sum;
|
||||
/// Informations about a single block
|
||||
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub struct VersionBlock {
|
||||
/// Blake2 sum of the block
|
||||
pub hash: Hash,
|
||||
/// Size of the block, before any kind of compression or encryption
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
Version {
|
||||
uuid: old.uuid,
|
||||
deleted: old.deleted,
|
||||
blocks: old.blocks,
|
||||
parts_etags: old.parts_etags,
|
||||
bucket_id: blake2sum(old.bucket.as_bytes()),
|
||||
key: old.key,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl garage_util::migrate::InitialFormat for Version {}
|
||||
}
|
||||
|
||||
pub(crate) mod v09 {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "garage_net"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
authors = ["Alex Auvolat <alex@adnab.me>"]
|
||||
edition = "2018"
|
||||
license = "AGPL-3.0"
|
||||
|
|
|
@ -35,8 +35,10 @@ pub type NetworkKey = sodiumoxide::crypto::auth::Key;
|
|||
/// composed of 8 bytes for Netapp version and 8 bytes for client version
|
||||
pub(crate) type VersionTag = [u8; 16];
|
||||
|
||||
/// Value of the Netapp version used in the version tag
|
||||
pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6e65746170700005; // netapp 0x0005
|
||||
/// Value of garage_net version used in the version tag
|
||||
/// We are no longer using prefix `netapp` as garage_net is forked from the netapp crate.
|
||||
/// Since Garage v1.0, we have replaced the prefix by `grgnet` (shorthand for garage_net).
|
||||
pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6772676e65740010; // grgnet 0x0010 (1.0)
|
||||
|
||||
/// HelloMessage is sent by the client on a Netapp connection to indicate
|
||||
/// that they are also a server and ready to receive incoming connections
|
||||
|
@ -123,7 +125,7 @@ impl NetApp {
|
|||
|
||||
netapp
|
||||
.hello_endpoint
|
||||
.swap(Some(netapp.endpoint("__netapp/netapp.rs/Hello".into())));
|
||||
.swap(Some(netapp.endpoint("garage_net/netapp.rs/Hello".into())));
|
||||
netapp
|
||||
.hello_endpoint
|
||||
.load_full()
|
||||
|
@ -292,13 +294,7 @@ impl NetApp {
|
|||
/// the other node with `Netapp::request`
|
||||
pub async fn try_connect(self: Arc<Self>, ip: SocketAddr, id: NodeID) -> Result<(), Error> {
|
||||
// Don't connect to ourself, we don't care
|
||||
// but pretend we did
|
||||
if id == self.id {
|
||||
tokio::spawn(async move {
|
||||
if let Some(h) = self.on_connected_handler.load().as_ref() {
|
||||
h(id, ip, false);
|
||||
}
|
||||
});
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
@ -327,9 +323,15 @@ impl NetApp {
|
|||
/// Close the outgoing connection we have to a node specified by its public key,
|
||||
/// if such a connection is currently open.
|
||||
pub fn disconnect(self: &Arc<Self>, id: &NodeID) {
|
||||
// If id is ourself, we're not supposed to have a connection open
|
||||
if *id != self.id {
|
||||
let conn = self.client_conns.write().unwrap().remove(id);
|
||||
|
||||
// If id is ourself, we're not supposed to have a connection open
|
||||
if *id == self.id {
|
||||
// sanity check
|
||||
assert!(conn.is_none(), "had a connection to local node");
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(c) = conn {
|
||||
debug!(
|
||||
"Closing connection to {} ({})",
|
||||
|
@ -337,14 +339,8 @@ impl NetApp {
|
|||
c.remote_addr
|
||||
);
|
||||
c.close();
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// call on_disconnected_handler immediately, since the connection
|
||||
// was removed
|
||||
// (if id == self.id, we pretend we disconnected)
|
||||
// call on_disconnected_handler immediately, since the connection was removed
|
||||
let id = *id;
|
||||
let self2 = self.clone();
|
||||
tokio::spawn(async move {
|
||||
|
@ -353,6 +349,7 @@ impl NetApp {
|
|||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Called from conn.rs when an incoming connection is successfully established
|
||||
// Registers the connection in our list of connections
|
||||
|
|
|
@ -54,12 +54,8 @@ impl Message for PeerListMessage {
|
|||
|
||||
#[derive(Debug)]
|
||||
struct PeerInfoInternal {
|
||||
// addr is the currently connected address,
|
||||
// or the last address we were connected to,
|
||||
// or an arbitrary address some other peer gave us
|
||||
addr: SocketAddr,
|
||||
// all_addrs contains all of the addresses everyone gave us
|
||||
all_addrs: Vec<SocketAddr>,
|
||||
// known_addrs contains all of the addresses everyone gave us
|
||||
known_addrs: Vec<SocketAddr>,
|
||||
|
||||
state: PeerConnState,
|
||||
last_send_ping: Option<Instant>,
|
||||
|
@ -69,10 +65,9 @@ struct PeerInfoInternal {
|
|||
}
|
||||
|
||||
impl PeerInfoInternal {
|
||||
fn new(addr: SocketAddr, state: PeerConnState) -> Self {
|
||||
fn new(state: PeerConnState, known_addr: Option<SocketAddr>) -> Self {
|
||||
Self {
|
||||
addr,
|
||||
all_addrs: vec![addr],
|
||||
known_addrs: known_addr.map(|x| vec![x]).unwrap_or_default(),
|
||||
state,
|
||||
last_send_ping: None,
|
||||
last_seen: None,
|
||||
|
@ -81,8 +76,8 @@ impl PeerInfoInternal {
|
|||
}
|
||||
}
|
||||
fn add_addr(&mut self, addr: SocketAddr) -> bool {
|
||||
if !self.all_addrs.contains(&addr) {
|
||||
self.all_addrs.push(addr);
|
||||
if !self.known_addrs.contains(&addr) {
|
||||
self.known_addrs.push(addr);
|
||||
// If we are learning a new address for this node,
|
||||
// we want to retry connecting
|
||||
self.state = match self.state {
|
||||
|
@ -90,7 +85,7 @@ impl PeerInfoInternal {
|
|||
PeerConnState::Waiting(_, _) | PeerConnState::Abandonned => {
|
||||
PeerConnState::Waiting(0, Instant::now())
|
||||
}
|
||||
x @ (PeerConnState::Ourself | PeerConnState::Connected) => x,
|
||||
x @ (PeerConnState::Ourself | PeerConnState::Connected { .. }) => x,
|
||||
};
|
||||
true
|
||||
} else {
|
||||
|
@ -104,8 +99,6 @@ impl PeerInfoInternal {
|
|||
pub struct PeerInfo {
|
||||
/// The node's identifier (its public key)
|
||||
pub id: NodeID,
|
||||
/// The node's network address
|
||||
pub addr: SocketAddr,
|
||||
/// The current status of our connection to this node
|
||||
pub state: PeerConnState,
|
||||
/// The last time at which the node was seen
|
||||
|
@ -136,7 +129,7 @@ pub enum PeerConnState {
|
|||
Ourself,
|
||||
|
||||
/// We currently have a connection to this peer
|
||||
Connected,
|
||||
Connected { addr: SocketAddr },
|
||||
|
||||
/// Our next connection tentative (the nth, where n is the first value of the tuple)
|
||||
/// will be at given Instant
|
||||
|
@ -152,7 +145,7 @@ pub enum PeerConnState {
|
|||
impl PeerConnState {
|
||||
/// Returns true if we can currently send requests to this peer
|
||||
pub fn is_up(&self) -> bool {
|
||||
matches!(self, Self::Ourself | Self::Connected)
|
||||
matches!(self, Self::Ourself | Self::Connected { .. })
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -164,29 +157,42 @@ struct KnownHosts {
|
|||
impl KnownHosts {
|
||||
fn new() -> Self {
|
||||
let list = HashMap::new();
|
||||
let hash = Self::calculate_hash(vec![]);
|
||||
Self { list, hash }
|
||||
let mut ret = Self {
|
||||
list,
|
||||
hash: hash::Digest::from_slice(&[0u8; 64][..]).unwrap(),
|
||||
};
|
||||
ret.update_hash();
|
||||
ret
|
||||
}
|
||||
fn update_hash(&mut self) {
|
||||
self.hash = Self::calculate_hash(self.connected_peers_vec());
|
||||
}
|
||||
fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> {
|
||||
let mut list = Vec::with_capacity(self.list.len());
|
||||
for (id, peer) in self.list.iter() {
|
||||
if peer.state.is_up() {
|
||||
list.push((*id, peer.addr));
|
||||
}
|
||||
}
|
||||
list
|
||||
}
|
||||
fn calculate_hash(mut list: Vec<(NodeID, SocketAddr)>) -> hash::Digest {
|
||||
// The hash is a value that is exchanged between nodes when they ping one
|
||||
// another. Nodes compare their known hosts hash to know if they are connected
|
||||
// to the same set of nodes. If the hashes differ, they are connected to
|
||||
// different nodes and they trigger an exchange of the full list of active
|
||||
// connections. The hash value only represents the set of node IDs and not
|
||||
// their actual socket addresses, because nodes can be connected via different
|
||||
// addresses and that shouldn't necessarily trigger a full peer exchange.
|
||||
let mut list = self
|
||||
.list
|
||||
.iter()
|
||||
.filter(|(_, peer)| peer.state.is_up())
|
||||
.map(|(id, _)| *id)
|
||||
.collect::<Vec<_>>();
|
||||
list.sort();
|
||||
let mut hash_state = hash::State::new();
|
||||
for (id, addr) in list {
|
||||
for id in list {
|
||||
hash_state.update(&id[..]);
|
||||
hash_state.update(&format!("{}\n", addr).into_bytes()[..]);
|
||||
}
|
||||
hash_state.finalize()
|
||||
self.hash = hash_state.finalize();
|
||||
}
|
||||
fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> {
|
||||
self.list
|
||||
.iter()
|
||||
.filter_map(|(id, peer)| match peer.state {
|
||||
PeerConnState::Connected { addr } => Some((*id, addr)),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -220,27 +226,24 @@ impl PeeringManager {
|
|||
if id != netapp.id {
|
||||
known_hosts.list.insert(
|
||||
id,
|
||||
PeerInfoInternal::new(addr, PeerConnState::Waiting(0, Instant::now())),
|
||||
PeerInfoInternal::new(PeerConnState::Waiting(0, Instant::now()), Some(addr)),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(addr) = our_addr {
|
||||
known_hosts.list.insert(
|
||||
netapp.id,
|
||||
PeerInfoInternal::new(addr, PeerConnState::Ourself),
|
||||
PeerInfoInternal::new(PeerConnState::Ourself, our_addr),
|
||||
);
|
||||
known_hosts.update_hash();
|
||||
}
|
||||
|
||||
// TODO for v0.10 / v1.0 : rename the endpoint (it will break compatibility)
|
||||
let strat = Arc::new(Self {
|
||||
netapp: netapp.clone(),
|
||||
known_hosts: RwLock::new(known_hosts),
|
||||
public_peer_list: ArcSwap::new(Arc::new(Vec::new())),
|
||||
next_ping_id: AtomicU64::new(42),
|
||||
ping_endpoint: netapp.endpoint("__netapp/peering/fullmesh.rs/Ping".into()),
|
||||
peer_list_endpoint: netapp.endpoint("__netapp/peering/fullmesh.rs/PeerList".into()),
|
||||
ping_endpoint: netapp.endpoint("garage_net/peering.rs/Ping".into()),
|
||||
peer_list_endpoint: netapp.endpoint("garage_net/peering.rs/PeerList".into()),
|
||||
ping_timeout_millis: DEFAULT_PING_TIMEOUT_MILLIS.into(),
|
||||
});
|
||||
|
||||
|
@ -276,7 +279,7 @@ impl PeeringManager {
|
|||
for (id, info) in known_hosts.list.iter() {
|
||||
trace!("{}, {:?}", hex::encode(&id[..8]), info);
|
||||
match info.state {
|
||||
PeerConnState::Connected => {
|
||||
PeerConnState::Connected { .. } => {
|
||||
let must_ping = match info.last_send_ping {
|
||||
None => true,
|
||||
Some(t) => Instant::now() - t > PING_INTERVAL,
|
||||
|
@ -319,7 +322,7 @@ impl PeeringManager {
|
|||
info!(
|
||||
"Retrying connection to {} at {} ({})",
|
||||
hex::encode(&id[..8]),
|
||||
h.all_addrs
|
||||
h.known_addrs
|
||||
.iter()
|
||||
.map(|x| format!("{}", x))
|
||||
.collect::<Vec<_>>()
|
||||
|
@ -328,13 +331,8 @@ impl PeeringManager {
|
|||
);
|
||||
h.state = PeerConnState::Trying(i);
|
||||
|
||||
let alternate_addrs = h
|
||||
.all_addrs
|
||||
.iter()
|
||||
.filter(|x| **x != h.addr)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
tokio::spawn(self.clone().try_connect(id, h.addr, alternate_addrs));
|
||||
let addresses = h.known_addrs.clone();
|
||||
tokio::spawn(self.clone().try_connect(id, addresses));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -362,27 +360,24 @@ impl PeeringManager {
|
|||
fn update_public_peer_list(&self, known_hosts: &KnownHosts) {
|
||||
let mut pub_peer_list = Vec::with_capacity(known_hosts.list.len());
|
||||
for (id, info) in known_hosts.list.iter() {
|
||||
if *id == self.netapp.id {
|
||||
// sanity check
|
||||
assert!(matches!(info.state, PeerConnState::Ourself));
|
||||
}
|
||||
let mut pings = info.ping.iter().cloned().collect::<Vec<_>>();
|
||||
pings.sort();
|
||||
if !pings.is_empty() {
|
||||
pub_peer_list.push(PeerInfo {
|
||||
id: *id,
|
||||
addr: info.addr,
|
||||
state: info.state,
|
||||
last_seen: info.last_seen,
|
||||
avg_ping: Some(
|
||||
pings
|
||||
.iter()
|
||||
.fold(Duration::from_secs(0), |x, y| x + *y)
|
||||
.div_f64(pings.len() as f64),
|
||||
),
|
||||
avg_ping: Some(pings.iter().sum::<Duration>().div_f64(pings.len() as f64)),
|
||||
max_ping: pings.last().cloned(),
|
||||
med_ping: Some(pings[pings.len() / 2]),
|
||||
});
|
||||
} else {
|
||||
pub_peer_list.push(PeerInfo {
|
||||
id: *id,
|
||||
addr: info.addr,
|
||||
state: info.state,
|
||||
last_seen: info.last_seen,
|
||||
avg_ping: None,
|
||||
|
@ -495,15 +490,10 @@ impl PeeringManager {
|
|||
}
|
||||
}
|
||||
|
||||
async fn try_connect(
|
||||
self: Arc<Self>,
|
||||
id: NodeID,
|
||||
default_addr: SocketAddr,
|
||||
alternate_addrs: Vec<SocketAddr>,
|
||||
) {
|
||||
async fn try_connect(self: Arc<Self>, id: NodeID, addresses: Vec<SocketAddr>) {
|
||||
let conn_addr = {
|
||||
let mut ret = None;
|
||||
for addr in [default_addr].iter().chain(alternate_addrs.iter()) {
|
||||
for addr in addresses.iter() {
|
||||
debug!("Trying address {} for peer {}", addr, hex::encode(&id[..8]));
|
||||
match self.netapp.clone().try_connect(*addr, id).await {
|
||||
Ok(()) => {
|
||||
|
@ -529,7 +519,7 @@ impl PeeringManager {
|
|||
warn!(
|
||||
"Could not connect to peer {} ({} addresses tried)",
|
||||
hex::encode(&id[..8]),
|
||||
1 + alternate_addrs.len()
|
||||
addresses.len()
|
||||
);
|
||||
let mut known_hosts = self.known_hosts.write().unwrap();
|
||||
if let Some(host) = known_hosts.list.get_mut(&id) {
|
||||
|
@ -549,6 +539,14 @@ impl PeeringManager {
|
|||
}
|
||||
|
||||
fn on_connected(self: &Arc<Self>, id: NodeID, addr: SocketAddr, is_incoming: bool) {
|
||||
if id == self.netapp.id {
|
||||
// sanity check
|
||||
panic!(
|
||||
"on_connected from local node, id={:?}, addr={}, incoming={}",
|
||||
id, addr, is_incoming
|
||||
);
|
||||
}
|
||||
|
||||
let mut known_hosts = self.known_hosts.write().unwrap();
|
||||
if is_incoming {
|
||||
if let Some(host) = known_hosts.list.get_mut(&id) {
|
||||
|
@ -563,13 +561,13 @@ impl PeeringManager {
|
|||
addr
|
||||
);
|
||||
if let Some(host) = known_hosts.list.get_mut(&id) {
|
||||
host.state = PeerConnState::Connected;
|
||||
host.addr = addr;
|
||||
host.state = PeerConnState::Connected { addr };
|
||||
host.add_addr(addr);
|
||||
} else {
|
||||
known_hosts
|
||||
.list
|
||||
.insert(id, PeerInfoInternal::new(addr, PeerConnState::Connected));
|
||||
known_hosts.list.insert(
|
||||
id,
|
||||
PeerInfoInternal::new(PeerConnState::Connected { addr }, Some(addr)),
|
||||
);
|
||||
}
|
||||
}
|
||||
known_hosts.update_hash();
|
||||
|
@ -589,12 +587,8 @@ impl PeeringManager {
|
|||
}
|
||||
|
||||
fn new_peer(&self, id: &NodeID, addr: SocketAddr) -> PeerInfoInternal {
|
||||
let state = if *id == self.netapp.id {
|
||||
PeerConnState::Ourself
|
||||
} else {
|
||||
PeerConnState::Waiting(0, Instant::now())
|
||||
};
|
||||
PeerInfoInternal::new(addr, state)
|
||||
assert!(*id != self.netapp.id);
|
||||
PeerInfoInternal::new(PeerConnState::Waiting(0, Instant::now()), Some(addr))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "garage_rpc"
|
||||
version = "0.9.4"
|
||||
version = "1.0.0"
|
||||
authors = ["Alex Auvolat <alex@adnab.me>"]
|
||||
edition = "2018"
|
||||
license = "AGPL-3.0"
|
||||
|
|
|
@ -114,16 +114,6 @@ impl Graph<FlowEdge> {
|
|||
Ok(result)
|
||||
}
|
||||
|
||||
/// This function returns the value of the flow incoming to v.
|
||||
pub fn get_inflow(&self, v: Vertex) -> Result<i64, String> {
|
||||
let idv = self.get_vertex_id(&v)?;
|
||||
let mut result = 0;
|
||||
for edge in self.graph[idv].iter() {
|
||||
result += max(0, self.graph[edge.dest][edge.rev].flow);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// This function returns the value of the flow outgoing from v.
|
||||
pub fn get_outflow(&self, v: Vertex) -> Result<i64, String> {
|
||||
let idv = self.get_vertex_id(&v)?;
|
299
src/rpc/layout/helper.rs
Normal file
299
src/rpc/layout/helper.rs
Normal file
|
@ -0,0 +1,299 @@
|
|||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use garage_util::data::*;
|
||||
|
||||
use super::*;
|
||||
use crate::replication_mode::*;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
|
||||
pub struct RpcLayoutDigest {
|
||||
/// Cluster layout version
|
||||
pub current_version: u64,
|
||||
/// Number of active layout versions
|
||||
pub active_versions: usize,
|
||||
/// Hash of cluster layout update trackers
|
||||
pub trackers_hash: Hash,
|
||||
/// Hash of cluster layout staging data
|
||||
pub staging_hash: Hash,
|
||||
}
|
||||
|
||||
/// Small copyable snapshot of three layout version numbers, as produced by
/// `LayoutHelper::sync_digest`. Two digests compare equal when all three match.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct SyncLayoutDigest {
    // Version number of the current layout
    current: u64,
    // Cached `ack_map_min` of the helper at the time of the snapshot
    ack_map_min: u64,
    // Value of `min_stored()` on the layout history at the time of the snapshot
    min_stored: u64,
}
|
||||
|
||||
pub struct LayoutHelper {
|
||||
replication_factor: ReplicationFactor,
|
||||
consistency_mode: ConsistencyMode,
|
||||
layout: Option<LayoutHistory>,
|
||||
|
||||
// cached values
|
||||
ack_map_min: u64,
|
||||
sync_map_min: u64,
|
||||
|
||||
all_nodes: Vec<Uuid>,
|
||||
all_nongateway_nodes: Vec<Uuid>,
|
||||
|
||||
trackers_hash: Hash,
|
||||
staging_hash: Hash,
|
||||
is_check_ok: bool,
|
||||
|
||||
// ack lock: counts in-progress write operations for each
|
||||
// layout version ; we don't increase the ack update tracker
|
||||
// while this lock is nonzero
|
||||
pub(crate) ack_lock: HashMap<u64, AtomicUsize>,
|
||||
}
|
||||
|
||||
impl LayoutHelper {
|
||||
pub fn new(
|
||||
replication_factor: ReplicationFactor,
|
||||
consistency_mode: ConsistencyMode,
|
||||
mut layout: LayoutHistory,
|
||||
mut ack_lock: HashMap<u64, AtomicUsize>,
|
||||
) -> Self {
|
||||
// In the new() function of the helper, we do a bunch of cleanup
|
||||
// and calculations on the layout history to make sure things are
|
||||
// correct and we have rapid access to important values such as
|
||||
// the layout versions to use when reading to ensure consistency.
|
||||
|
||||
if consistency_mode != ConsistencyMode::Consistent {
|
||||
// Fast path for when no consistency is required.
|
||||
// In this case we only need to keep the last version of the layout,
|
||||
// we don't care about coordinating stuff in the cluster.
|
||||
layout.keep_current_version_only();
|
||||
}
|
||||
|
||||
layout.cleanup_old_versions();
|
||||
|
||||
let all_nodes = layout.get_all_nodes();
|
||||
let all_nongateway_nodes = layout.get_all_nongateway_nodes();
|
||||
|
||||
layout.clamp_update_trackers(&all_nodes);
|
||||
|
||||
let min_version = layout.min_stored();
|
||||
|
||||
// ack_map_min is the minimum value of ack_map among all nodes
|
||||
// in the cluster (gateway, non-gateway, current and previous layouts).
|
||||
// It is the highest layout version which all of these nodes have
|
||||
// acknowledged, indicating that they are aware of it and are no
|
||||
// longer processing write operations that did not take it into account.
|
||||
let ack_map_min = layout
|
||||
.update_trackers
|
||||
.ack_map
|
||||
.min_among(&all_nodes, min_version);
|
||||
|
||||
// sync_map_min is the minimum value of sync_map among storage nodes
|
||||
// in the cluster (non-gateway nodes only, current and previous layouts).
|
||||
// It is the highest layout version for which we know that all relevant
|
||||
// storage nodes have fullfilled a sync, and therefore it is safe to
|
||||
// use a read quorum within that layout to ensure consistency.
|
||||
// Gateway nodes are excluded here because they hold no relevant data
|
||||
// (they store the bucket and access key tables, but we don't have
|
||||
// consistency on those).
|
||||
// This value is calculated using quorums to allow progress even
|
||||
// if not all nodes have successfully completed a sync.
|
||||
let sync_map_min =
|
||||
layout.calculate_sync_map_min_with_quorum(replication_factor, &all_nongateway_nodes);
|
||||
|
||||
let trackers_hash = layout.calculate_trackers_hash();
|
||||
let staging_hash = layout.calculate_staging_hash();
|
||||
|
||||
ack_lock.retain(|_, cnt| *cnt.get_mut() > 0);
|
||||
ack_lock
|
||||
.entry(layout.current().version)
|
||||
.or_insert(AtomicUsize::new(0));
|
||||
|
||||
let is_check_ok = layout.check().is_ok();
|
||||
|
||||
LayoutHelper {
|
||||
replication_factor,
|
||||
consistency_mode,
|
||||
layout: Some(layout),
|
||||
ack_map_min,
|
||||
sync_map_min,
|
||||
all_nodes,
|
||||
all_nongateway_nodes,
|
||||
trackers_hash,
|
||||
staging_hash,
|
||||
ack_lock,
|
||||
is_check_ok,
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------ single updating function --------------
|
||||
|
||||
pub(crate) fn update<F>(&mut self, f: F) -> bool
|
||||
where
|
||||
F: FnOnce(&mut LayoutHistory) -> bool,
|
||||
{
|
||||
let changed = f(self.layout.as_mut().unwrap());
|
||||
if changed {
|
||||
*self = Self::new(
|
||||
self.replication_factor,
|
||||
self.consistency_mode,
|
||||
self.layout.take().unwrap(),
|
||||
std::mem::take(&mut self.ack_lock),
|
||||
);
|
||||
}
|
||||
changed
|
||||
}
|
||||
|
||||
// ------------------ read helpers ---------------
|
||||
|
||||
/// Borrows the wrapped layout history.
///
/// Panics if the history slot is empty. In the code visible here the slot is only
/// emptied inside `update`, which refills it before returning.
pub fn inner(&self) -> &LayoutHistory {
    let history = self.layout.as_ref();
    history.unwrap()
}
|
||||
|
||||
/// Shortcut for `self.inner().current()`.
pub fn current(&self) -> &LayoutVersion {
    let history = self.inner();
    history.current()
}
|
||||
|
||||
/// Borrows the `versions` list of the wrapped history as a slice.
pub fn versions(&self) -> &[LayoutVersion] {
    let history = self.inner();
    &history.versions
}
|
||||
|
||||
pub fn is_check_ok(&self) -> bool {
|
||||
self.is_check_ok
|
||||
}
|
||||
|
||||
/// Return all nodes that have a role (gateway or storage)
|
||||
/// in one of the currently active layout versions
|
||||
pub fn all_nodes(&self) -> &[Uuid] {
|
||||
&self.all_nodes
|
||||
}
|
||||
|
||||
/// Return all nodes that are configured to store data
|
||||
/// in one of the currently active layout versions
|
||||
pub fn all_nongateway_nodes(&self) -> &[Uuid] {
|
||||
&self.all_nongateway_nodes
|
||||
}
|
||||
|
||||
pub fn ack_map_min(&self) -> u64 {
|
||||
self.ack_map_min
|
||||
}
|
||||
|
||||
pub fn sync_map_min(&self) -> u64 {
|
||||
self.sync_map_min
|
||||
}
|
||||
|
||||
pub fn sync_digest(&self) -> SyncLayoutDigest {
|
||||
SyncLayoutDigest {
|
||||
current: self.current().version,
|
||||
ack_map_min: self.ack_map_min(),
|
||||
min_stored: self.inner().min_stored(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Lists the nodes to read `position` from.
///
/// The nodes come from the active layout version whose number equals `sync_map_min`.
/// If no active version has that number, the most recent version is used instead.
/// Panics if there is no active version at all.
pub fn read_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
    let versions = self.versions();
    let target = self.sync_map_min;

    let chosen = match versions.iter().find(|v| v.version == target) {
        Some(v) => v,
        None => versions.last().unwrap(),
    };

    chosen
        .nodes_of(position, chosen.replication_factor)
        .collect()
}
|
||||
|
||||
/// Returns, for each active layout version (in history order), the list of nodes
/// that version assigns to `position`.
pub fn storage_sets_of(&self, position: &Hash) -> Vec<Vec<Uuid>> {
    let versions = self.versions();
    let mut sets = Vec::with_capacity(versions.len());
    for v in versions {
        sets.push(v.nodes_of(position, v.replication_factor).collect());
    }
    sets
}
|
||||
|
||||
/// Returns the sorted, duplicate-free union of the nodes that any active layout
/// version assigns to `position`.
pub fn storage_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
    let mut nodes: Vec<Uuid> = self
        .versions()
        .iter()
        .flat_map(|v| v.nodes_of(position, v.replication_factor))
        .collect();
    // `dedup` only removes adjacent duplicates, hence the sort first.
    nodes.sort();
    nodes.dedup();
    nodes
}
|
||||
|
||||
pub fn trackers_hash(&self) -> Hash {
|
||||
self.trackers_hash
|
||||
}
|
||||
|
||||
pub fn staging_hash(&self) -> Hash {
|
||||
self.staging_hash
|
||||
}
|
||||
|
||||
pub fn digest(&self) -> RpcLayoutDigest {
|
||||
RpcLayoutDigest {
|
||||
current_version: self.current().version,
|
||||
active_versions: self.versions().len(),
|
||||
trackers_hash: self.trackers_hash,
|
||||
staging_hash: self.staging_hash,
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------ helpers for update tracking ---------------
|
||||
|
||||
pub(crate) fn update_update_trackers(&mut self, local_node_id: Uuid) {
|
||||
// Ensure trackers for this node's values are up-to-date
|
||||
|
||||
// 1. Acknowledge the last layout version which is not currently
|
||||
// locked by an in-progress write operation
|
||||
self.update_ack_to_max_free(local_node_id);
|
||||
|
||||
// 2. Assume the data on this node is sync'ed up at least to
|
||||
// the first layout version in the history
|
||||
let first_version = self.inner().min_stored();
|
||||
self.update(|layout| {
|
||||
layout
|
||||
.update_trackers
|
||||
.sync_map
|
||||
.set_max(local_node_id, first_version)
|
||||
});
|
||||
|
||||
// 3. Acknowledge everyone has synced up to min(self.sync_map)
|
||||
let sync_map_min = self.sync_map_min;
|
||||
self.update(|layout| {
|
||||
layout
|
||||
.update_trackers
|
||||
.sync_ack_map
|
||||
.set_max(local_node_id, sync_map_min)
|
||||
});
|
||||
|
||||
debug!("ack_map: {:?}", self.inner().update_trackers.ack_map);
|
||||
debug!("sync_map: {:?}", self.inner().update_trackers.sync_map);
|
||||
debug!(
|
||||
"sync_ack_map: {:?}",
|
||||
self.inner().update_trackers.sync_ack_map
|
||||
);
|
||||
}
|
||||
|
||||
pub(crate) fn update_ack_to_max_free(&mut self, local_node_id: Uuid) -> bool {
|
||||
let max_free = self
|
||||
.versions()
|
||||
.iter()
|
||||
.map(|x| x.version)
|
||||
.skip_while(|v| {
|
||||
self.ack_lock
|
||||
.get(v)
|
||||
.map(|x| x.load(Ordering::Relaxed) == 0)
|
||||
.unwrap_or(true)
|
||||
})
|
||||
.next()
|
||||
.unwrap_or(self.current().version);
|
||||
let changed = self.update(|layout| {
|
||||
layout
|
||||
.update_trackers
|
||||
.ack_map
|
||||
.set_max(local_node_id, max_free)
|
||||
});
|
||||
if changed {
|
||||
info!("ack_until updated to {}", max_free);
|
||||
}
|
||||
changed
|
||||
}
|
||||
}
|
312
src/rpc/layout/history.rs
Normal file
312
src/rpc/layout/history.rs
Normal file
|
@ -0,0 +1,312 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use garage_util::crdt::{Crdt, Lww, LwwMap};
|
||||
use garage_util::data::*;
|
||||
use garage_util::encode::nonversioned_encode;
|
||||
use garage_util::error::*;
|
||||
|
||||
use super::*;
|
||||
use crate::replication_mode::*;
|
||||
|
||||
impl LayoutHistory {
|
||||
pub fn new(replication_factor: ReplicationFactor) -> Self {
|
||||
let version = LayoutVersion::new(replication_factor.into());
|
||||
|
||||
let staging = LayoutStaging {
|
||||
parameters: Lww::<LayoutParameters>::new(version.parameters),
|
||||
roles: LwwMap::new(),
|
||||
};
|
||||
|
||||
LayoutHistory {
|
||||
versions: vec![version],
|
||||
old_versions: vec![],
|
||||
update_trackers: Default::default(),
|
||||
staging: Lww::raw(0, staging),
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------ who stores what now? ---------------
|
||||
|
||||
/// Returns the layout version with the highest number
|
||||
pub fn current(&self) -> &LayoutVersion {
|
||||
self.versions.last().as_ref().unwrap()
|
||||
}
|
||||
|
||||
/// Returns the version number of the oldest layout version still active
|
||||
pub fn min_stored(&self) -> u64 {
|
||||
self.versions.first().as_ref().unwrap().version
|
||||
}
|
||||
|
||||
/// Calculate the set of all nodes that have a role (gateway or storage)
|
||||
/// in one of the currently active layout versions
|
||||
pub fn get_all_nodes(&self) -> Vec<Uuid> {
|
||||
if self.versions.len() == 1 {
|
||||
self.versions[0].all_nodes().to_vec()
|
||||
} else {
|
||||
let set = self
|
||||
.versions
|
||||
.iter()
|
||||
.flat_map(|x| x.all_nodes())
|
||||
.collect::<HashSet<_>>();
|
||||
set.into_iter().copied().collect::<Vec<_>>()
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate the set of all nodes that are configured to store data
|
||||
/// in one of the currently active layout versions
|
||||
pub(crate) fn get_all_nongateway_nodes(&self) -> Vec<Uuid> {
|
||||
if self.versions.len() == 1 {
|
||||
self.versions[0].nongateway_nodes().to_vec()
|
||||
} else {
|
||||
let set = self
|
||||
.versions
|
||||
.iter()
|
||||
.flat_map(|x| x.nongateway_nodes())
|
||||
.collect::<HashSet<_>>();
|
||||
set.into_iter().copied().collect::<Vec<_>>()
|
||||
}
|
||||
}
|
||||
|
||||
// ---- housekeeping (all invoked by LayoutHelper) ----
|
||||
|
||||
pub(crate) fn keep_current_version_only(&mut self) {
|
||||
while self.versions.len() > 1 {
|
||||
let removed = self.versions.remove(0);
|
||||
self.old_versions.push(removed);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn cleanup_old_versions(&mut self) {
|
||||
// If there are invalid versions before valid versions, remove them
|
||||
if self.versions.len() > 1 && self.current().check().is_ok() {
|
||||
while self.versions.len() > 1 && self.versions.first().unwrap().check().is_err() {
|
||||
let removed = self.versions.remove(0);
|
||||
info!(
|
||||
"Layout history: pruning old invalid version {}",
|
||||
removed.version
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// If there are old versions that no one is reading from anymore,
|
||||
// remove them (keep them in self.old_versions).
|
||||
// ASSUMPTION: we only care about where nodes in the current layout version
|
||||
// are reading from, as we assume older nodes are being discarded.
|
||||
let current_nodes = &self.current().node_id_vec;
|
||||
let min_version = self.min_stored();
|
||||
let sync_ack_map_min = self
|
||||
.update_trackers
|
||||
.sync_ack_map
|
||||
.min_among(current_nodes, min_version);
|
||||
while self.min_stored() < sync_ack_map_min {
|
||||
assert!(self.versions.len() > 1);
|
||||
let removed = self.versions.remove(0);
|
||||
info!(
|
||||
"Layout history: moving version {} to old_versions",
|
||||
removed.version
|
||||
);
|
||||
self.old_versions.push(removed);
|
||||
}
|
||||
|
||||
while self.old_versions.len() > OLD_VERSION_COUNT {
|
||||
let removed = self.old_versions.remove(0);
|
||||
info!("Layout history: removing old_version {}", removed.version);
|
||||
}
|
||||
}
|
||||
|
||||
/// For every node in `nodes`, applies `set_max` with the oldest active version number
/// to the node's entry in each of the three update trackers.
pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) {
    let floor = self.min_stored();
    let trackers = &mut self.update_trackers;
    for node in nodes.iter().copied() {
        trackers.ack_map.set_max(node, floor);
        trackers.sync_map.set_max(node, floor);
        trackers.sync_ack_map.set_max(node, floor);
    }
}
|
||||
|
||||
pub(crate) fn calculate_sync_map_min_with_quorum(
|
||||
&self,
|
||||
replication_factor: ReplicationFactor,
|
||||
all_nongateway_nodes: &[Uuid],
|
||||
) -> u64 {
|
||||
// This function calculates the minimum layout version from which
|
||||
// it is safe to read if we want to maintain read-after-write consistency.
|
||||
// In the general case the computation can be a bit expensive so
|
||||
// we try to optimize it in several ways.
|
||||
|
||||
// If there is only one layout version, we know that's the one
|
||||
// we need to read from.
|
||||
if self.versions.len() == 1 {
|
||||
return self.current().version;
|
||||
}
|
||||
|
||||
let quorum = replication_factor.write_quorum(ConsistencyMode::Consistent);
|
||||
|
||||
let min_version = self.min_stored();
|
||||
let global_min = self
|
||||
.update_trackers
|
||||
.sync_map
|
||||
.min_among(all_nongateway_nodes, min_version);
|
||||
|
||||
// If the write quorums are equal to the total number of nodes,
|
||||
// i.e. no writes can succeed while they are not written to all nodes,
|
||||
// then we must in all case wait for all nodes to complete a sync.
|
||||
// This is represented by reading from the layout with version
|
||||
// number global_min, the smallest layout version for which all nodes
|
||||
// have completed a sync.
|
||||
if quorum == self.current().replication_factor {
|
||||
return global_min;
|
||||
}
|
||||
|
||||
// In the general case, we need to look at all write sets for all partitions,
|
||||
// and find a safe layout version to read for that partition. We then
|
||||
// take the minimum value among all partition as the safe layout version
|
||||
// to read in all cases (the layout version to which all reads are directed).
|
||||
let mut current_min = self.current().version;
|
||||
let mut sets_done = HashSet::<Vec<Uuid>>::new();
|
||||
|
||||
for (_, p_hash) in self.current().partitions() {
|
||||
for v in self.versions.iter() {
|
||||
if v.version == self.current().version {
|
||||
// We don't care about whether nodes in the latest layout version
|
||||
// have completed a sync or not, as the sync is push-only
|
||||
// and by definition nodes in the latest layout version do not
|
||||
// hold data that must be pushed to nodes in the latest layout
|
||||
// version, since that's the same version (any data that's
|
||||
// already in the latest version is assumed to have been written
|
||||
// by an operation that ensured a quorum of writes within
|
||||
// that version).
|
||||
continue;
|
||||
}
|
||||
|
||||
// Determine set of nodes for partition p in layout version v.
|
||||
// Sort the node set to avoid duplicate computations.
|
||||
let mut set = v
|
||||
.nodes_of(&p_hash, v.replication_factor)
|
||||
.collect::<Vec<Uuid>>();
|
||||
set.sort();
|
||||
|
||||
// If this set was already processed, skip it.
|
||||
if sets_done.contains(&set) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find the value of the sync update trackers that is the
|
||||
// highest possible minimum within a quorum of nodes.
|
||||
let mut sync_values = set
|
||||
.iter()
|
||||
.map(|x| self.update_trackers.sync_map.get(x, min_version))
|
||||
.collect::<Vec<_>>();
|
||||
sync_values.sort();
|
||||
let set_min = sync_values[sync_values.len() - quorum];
|
||||
if set_min < current_min {
|
||||
current_min = set_min;
|
||||
}
|
||||
// defavorable case, we know we are at the smallest possible version,
|
||||
// so we can stop early
|
||||
assert!(current_min >= global_min);
|
||||
if current_min == global_min {
|
||||
return current_min;
|
||||
}
|
||||
|
||||
// Add set to already processed sets
|
||||
sets_done.insert(set);
|
||||
}
|
||||
}
|
||||
|
||||
current_min
|
||||
}
|
||||
|
||||
pub(crate) fn calculate_trackers_hash(&self) -> Hash {
|
||||
blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..])
|
||||
}
|
||||
|
||||
pub(crate) fn calculate_staging_hash(&self) -> Hash {
|
||||
blake2sum(&nonversioned_encode(&self.staging).unwrap()[..])
|
||||
}
|
||||
|
||||
// ================== updates to layout, public interface ===================
|
||||
|
||||
pub fn merge(&mut self, other: &LayoutHistory) -> bool {
|
||||
let mut changed = false;
|
||||
|
||||
// Add any new versions to history
|
||||
for v2 in other.versions.iter() {
|
||||
if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) {
|
||||
// Version is already present, check consistency
|
||||
if v1 != v2 {
|
||||
error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version);
|
||||
}
|
||||
} else if self.versions.iter().all(|v| v.version != v2.version - 1) {
|
||||
error!(
|
||||
"Cannot receive new layout version {}, version {} is missing",
|
||||
v2.version,
|
||||
v2.version - 1
|
||||
);
|
||||
} else {
|
||||
self.versions.push(v2.clone());
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Merge trackers
|
||||
let c = self.update_trackers.merge(&other.update_trackers);
|
||||
changed = changed || c;
|
||||
|
||||
// Merge staged layout changes
|
||||
if self.staging != other.staging {
|
||||
let prev_staging = self.staging.clone();
|
||||
self.staging.merge(&other.staging);
|
||||
changed = changed || self.staging != prev_staging;
|
||||
}
|
||||
|
||||
changed
|
||||
}
|
||||
|
||||
pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> {
|
||||
match version {
|
||||
None => {
|
||||
let error = r#"
|
||||
Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
|
||||
To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
|
||||
"#;
|
||||
return Err(Error::Message(error.into()));
|
||||
}
|
||||
Some(v) => {
|
||||
if v != self.current().version + 1 {
|
||||
return Err(Error::Message("Invalid new layout version".into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute new version and add it to history
|
||||
let (new_version, msg) = self
|
||||
.current()
|
||||
.clone()
|
||||
.calculate_next_version(self.staging.get())?;
|
||||
|
||||
self.versions.push(new_version);
|
||||
self.cleanup_old_versions();
|
||||
|
||||
// Reset the staged layout changes
|
||||
self.staging.update(LayoutStaging {
|
||||
parameters: self.staging.get().parameters.clone(),
|
||||
roles: LwwMap::new(),
|
||||
});
|
||||
|
||||
Ok((self, msg))
|
||||
}
|
||||
|
||||
pub fn revert_staged_changes(mut self) -> Result<Self, Error> {
|
||||
self.staging.update(LayoutStaging {
|
||||
parameters: Lww::new(self.current().parameters),
|
||||
roles: LwwMap::new(),
|
||||
});
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
pub fn check(&self) -> Result<(), String> {
|
||||
// TODO: anything more ?
|
||||
self.current().check()
|
||||
}
|
||||
}
|
381
src/rpc/layout/manager.rs
Normal file
381
src/rpc/layout/manager.rs
Normal file
|
@ -0,0 +1,381 @@
|
|||
use std::collections::HashMap;
|
||||
use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard};
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::sync::Notify;
|
||||
|
||||
use garage_net::endpoint::Endpoint;
|
||||
use garage_net::peering::PeeringManager;
|
||||
use garage_net::NodeID;
|
||||
|
||||
use garage_util::config::Config;
|
||||
use garage_util::data::*;
|
||||
use garage_util::error::*;
|
||||
use garage_util::persister::Persister;
|
||||
|
||||
use super::*;
|
||||
use crate::replication_mode::*;
|
||||
use crate::rpc_helper::*;
|
||||
use crate::system::*;
|
||||
|
||||
pub struct LayoutManager {
|
||||
node_id: Uuid,
|
||||
replication_factor: ReplicationFactor,
|
||||
persist_cluster_layout: Persister<LayoutHistory>,
|
||||
|
||||
layout: Arc<RwLock<LayoutHelper>>,
|
||||
pub(crate) change_notify: Arc<Notify>,
|
||||
|
||||
table_sync_version: Mutex<HashMap<String, u64>>,
|
||||
|
||||
pub(crate) rpc_helper: RpcHelper,
|
||||
system_endpoint: Arc<Endpoint<SystemRpc, System>>,
|
||||
}
|
||||
|
||||
impl LayoutManager {
|
||||
pub fn new(
|
||||
config: &Config,
|
||||
node_id: NodeID,
|
||||
system_endpoint: Arc<Endpoint<SystemRpc, System>>,
|
||||
peering: Arc<PeeringManager>,
|
||||
replication_factor: ReplicationFactor,
|
||||
consistency_mode: ConsistencyMode,
|
||||
) -> Result<Arc<Self>, Error> {
|
||||
let persist_cluster_layout: Persister<LayoutHistory> =
|
||||
Persister::new(&config.metadata_dir, "cluster_layout");
|
||||
|
||||
let cluster_layout = match persist_cluster_layout.load() {
|
||||
Ok(x) => {
|
||||
if x.current().replication_factor != replication_factor.replication_factor() {
|
||||
return Err(Error::Message(format!(
|
||||
"Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
|
||||
x.current().replication_factor,
|
||||
replication_factor.replication_factor()
|
||||
)));
|
||||
}
|
||||
x
|
||||
}
|
||||
Err(e) => {
|
||||
info!(
|
||||
"No valid previous cluster layout stored ({}), starting fresh.",
|
||||
e
|
||||
);
|
||||
LayoutHistory::new(replication_factor)
|
||||
}
|
||||
};
|
||||
|
||||
let mut cluster_layout = LayoutHelper::new(
|
||||
replication_factor,
|
||||
consistency_mode,
|
||||
cluster_layout,
|
||||
Default::default(),
|
||||
);
|
||||
cluster_layout.update_update_trackers(node_id.into());
|
||||
|
||||
let layout = Arc::new(RwLock::new(cluster_layout));
|
||||
let change_notify = Arc::new(Notify::new());
|
||||
|
||||
let rpc_helper = RpcHelper::new(
|
||||
node_id.into(),
|
||||
peering,
|
||||
layout.clone(),
|
||||
config.rpc_timeout_msec.map(Duration::from_millis),
|
||||
);
|
||||
|
||||
Ok(Arc::new(Self {
|
||||
node_id: node_id.into(),
|
||||
replication_factor,
|
||||
persist_cluster_layout,
|
||||
layout,
|
||||
change_notify,
|
||||
table_sync_version: Mutex::new(HashMap::new()),
|
||||
system_endpoint,
|
||||
rpc_helper,
|
||||
}))
|
||||
}
|
||||
|
||||
// ---- PUBLIC INTERFACE ----
|
||||
|
||||
/// Takes a read lock on the layout helper.
///
/// Panics if the lock is poisoned.
pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHelper> {
    let guard = self.layout.read();
    guard.unwrap()
}
|
||||
|
||||
pub async fn update_cluster_layout(
|
||||
self: &Arc<Self>,
|
||||
layout: &LayoutHistory,
|
||||
) -> Result<(), Error> {
|
||||
self.handle_advertise_cluster_layout(layout).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_table(&self, table_name: &'static str) {
|
||||
let first_version = self.layout().versions().first().unwrap().version;
|
||||
|
||||
self.table_sync_version
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(table_name.to_string(), first_version);
|
||||
}
|
||||
|
||||
pub fn sync_table_until(self: &Arc<Self>, table_name: &'static str, version: u64) {
|
||||
let mut table_sync_version = self.table_sync_version.lock().unwrap();
|
||||
*table_sync_version.get_mut(table_name).unwrap() = version;
|
||||
let sync_until = table_sync_version.iter().map(|(_, v)| *v).min().unwrap();
|
||||
drop(table_sync_version);
|
||||
|
||||
let mut layout = self.layout.write().unwrap();
|
||||
if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) {
|
||||
info!("sync_until updated to {}", sync_until);
|
||||
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(
|
||||
layout.inner().update_trackers.clone(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
fn ack_new_version(self: &Arc<Self>) {
|
||||
let mut layout = self.layout.write().unwrap();
|
||||
if layout.update_ack_to_max_free(self.node_id) {
|
||||
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(
|
||||
layout.inner().update_trackers.clone(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// ---- ACK LOCKING ----
|
||||
|
||||
pub fn write_sets_of(self: &Arc<Self>, position: &Hash) -> WriteLock<Vec<Vec<Uuid>>> {
|
||||
let layout = self.layout();
|
||||
let version = layout.current().version;
|
||||
let nodes = layout.storage_sets_of(position);
|
||||
layout
|
||||
.ack_lock
|
||||
.get(&version)
|
||||
.unwrap()
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
WriteLock::new(version, self, nodes)
|
||||
}
|
||||
|
||||
// ---- INTERNALS ---
|
||||
|
||||
/// Merges an advertised layout history into the local one.
///
/// Returns a copy of the resulting history if the merge changed something, and
/// `None` otherwise. An advertised history that fails `check()` is ignored as long
/// as the local layout is itself valid.
///
/// Panics if merging into a valid local layout yields an invalid one, or if the
/// layout digest is unchanged after a merge that reported a change.
fn merge_layout(&self, adv: &LayoutHistory) -> Option<LayoutHistory> {
    let mut layout = self.layout.write().unwrap();
    let prev_digest = layout.digest();
    let prev_layout_check = layout.is_check_ok();

    // Never let an invalid remote layout into a valid local one.
    if prev_layout_check && adv.check().is_err() {
        return None;
    }
    if !layout.update(|l| l.merge(adv)) {
        return None;
    }

    layout.update_update_trackers(self.node_id);
    if prev_layout_check && !layout.is_check_ok() {
        panic!("Merged two correct layouts and got an incorrect layout.");
    }
    assert!(layout.digest() != prev_digest);

    Some(layout.inner().clone())
}
|
||||
|
||||
/// Merges advertised update trackers into the local ones.
///
/// Returns a copy of the resulting trackers if the merge changed something, and
/// `None` otherwise.
///
/// Panics if the layout digest is unchanged after a merge that reported a change.
fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option<UpdateTrackers> {
    let mut layout = self.layout.write().unwrap();
    let prev_digest = layout.digest();

    // Identical trackers: nothing to merge.
    if layout.inner().update_trackers == *adv {
        return None;
    }
    if !layout.update(|l| l.update_trackers.merge(adv)) {
        return None;
    }

    layout.update_update_trackers(self.node_id);
    assert!(layout.digest() != prev_digest);

    Some(layout.inner().update_trackers.clone())
}
|
||||
|
||||
async fn pull_cluster_layout(self: &Arc<Self>, peer: Uuid) {
|
||||
let resp = self
|
||||
.rpc_helper
|
||||
.call(
|
||||
&self.system_endpoint,
|
||||
peer,
|
||||
SystemRpc::PullClusterLayout,
|
||||
RequestStrategy::with_priority(PRIO_HIGH),
|
||||
)
|
||||
.await;
|
||||
if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
|
||||
if let Err(e) = self.handle_advertise_cluster_layout(&layout).await {
|
||||
warn!("In pull_cluster_layout: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn pull_cluster_layout_trackers(self: &Arc<Self>, peer: Uuid) {
|
||||
let resp = self
|
||||
.rpc_helper
|
||||
.call(
|
||||
&self.system_endpoint,
|
||||
peer,
|
||||
SystemRpc::PullClusterLayoutTrackers,
|
||||
RequestStrategy::with_priority(PRIO_HIGH),
|
||||
)
|
||||
.await;
|
||||
if let Ok(SystemRpc::AdvertiseClusterLayoutTrackers(trackers)) = resp {
|
||||
if let Err(e) = self
|
||||
.handle_advertise_cluster_layout_trackers(&trackers)
|
||||
.await
|
||||
{
|
||||
warn!("In pull_cluster_layout_trackers: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Save cluster layout data to disk
|
||||
async fn save_cluster_layout(&self) -> Result<(), Error> {
|
||||
let layout = self.layout.read().unwrap().inner().clone();
|
||||
self.persist_cluster_layout
|
||||
.save_async(&layout)
|
||||
.await
|
||||
.expect("Cannot save current cluster layout");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn broadcast_update(self: &Arc<Self>, rpc: SystemRpc) {
|
||||
tokio::spawn({
|
||||
let this = self.clone();
|
||||
async move {
|
||||
if let Err(e) = this
|
||||
.rpc_helper
|
||||
.broadcast(
|
||||
&this.system_endpoint,
|
||||
rpc,
|
||||
RequestStrategy::with_priority(PRIO_HIGH),
|
||||
)
|
||||
.await
|
||||
{
|
||||
warn!("Error while broadcasting new cluster layout: {}", e);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// ---- RPC HANDLERS ----
|
||||
|
||||
pub(crate) fn handle_advertise_status(self: &Arc<Self>, from: Uuid, remote: &RpcLayoutDigest) {
|
||||
let local = self.layout().digest();
|
||||
if remote.current_version > local.current_version
|
||||
|| remote.active_versions != local.active_versions
|
||||
|| remote.staging_hash != local.staging_hash
|
||||
{
|
||||
tokio::spawn({
|
||||
let this = self.clone();
|
||||
async move { this.pull_cluster_layout(from).await }
|
||||
});
|
||||
} else if remote.trackers_hash != local.trackers_hash {
|
||||
tokio::spawn({
|
||||
let this = self.clone();
|
||||
async move { this.pull_cluster_layout_trackers(from).await }
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc {
|
||||
let layout = self.layout.read().unwrap().inner().clone();
|
||||
SystemRpc::AdvertiseClusterLayout(layout)
|
||||
}
|
||||
|
||||
pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc {
|
||||
let layout = self.layout.read().unwrap();
|
||||
SystemRpc::AdvertiseClusterLayoutTrackers(layout.inner().update_trackers.clone())
|
||||
}
|
||||
|
||||
pub(crate) async fn handle_advertise_cluster_layout(
|
||||
self: &Arc<Self>,
|
||||
adv: &LayoutHistory,
|
||||
) -> Result<SystemRpc, Error> {
|
||||
debug!(
|
||||
"handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}",
|
||||
adv.versions.len(),
|
||||
adv.current().version,
|
||||
adv.update_trackers
|
||||
);
|
||||
|
||||
if adv.current().replication_factor != self.replication_factor.replication_factor() {
|
||||
let msg = format!(
|
||||
"Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
|
||||
adv.current().replication_factor,
|
||||
self.replication_factor.replication_factor()
|
||||
);
|
||||
error!("{}", msg);
|
||||
return Err(Error::Message(msg));
|
||||
}
|
||||
|
||||
if let Some(new_layout) = self.merge_layout(adv) {
|
||||
debug!("handle_advertise_cluster_layout: some changes were added to the current stuff");
|
||||
|
||||
self.change_notify.notify_waiters();
|
||||
self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout));
|
||||
self.save_cluster_layout().await?;
|
||||
}
|
||||
|
||||
Ok(SystemRpc::Ok)
|
||||
}
|
||||
|
||||
pub(crate) async fn handle_advertise_cluster_layout_trackers(
|
||||
self: &Arc<Self>,
|
||||
trackers: &UpdateTrackers,
|
||||
) -> Result<SystemRpc, Error> {
|
||||
debug!("handle_advertise_cluster_layout_trackers: {:?}", trackers);
|
||||
|
||||
if let Some(new_trackers) = self.merge_layout_trackers(trackers) {
|
||||
self.change_notify.notify_waiters();
|
||||
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers));
|
||||
self.save_cluster_layout().await?;
|
||||
}
|
||||
|
||||
Ok(SystemRpc::Ok)
|
||||
}
|
||||
}
|
||||
|
||||
// ---- ack lock ----
|
||||
|
||||
pub struct WriteLock<T> {
|
||||
layout_version: u64,
|
||||
layout_manager: Arc<LayoutManager>,
|
||||
value: T,
|
||||
}
|
||||
|
||||
impl<T> WriteLock<T> {
|
||||
fn new(version: u64, layout_manager: &Arc<LayoutManager>, value: T) -> Self {
|
||||
Self {
|
||||
layout_version: version,
|
||||
layout_manager: layout_manager.clone(),
|
||||
value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> AsRef<T> for WriteLock<T> {
|
||||
fn as_ref(&self) -> &T {
|
||||
&self.value
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> AsMut<T> for WriteLock<T> {
|
||||
fn as_mut(&mut self) -> &mut T {
|
||||
&mut self.value
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for WriteLock<T> {
|
||||
fn drop(&mut self) {
|
||||
let layout = self.layout_manager.layout(); // acquire read lock
|
||||
if let Some(counter) = layout.ack_lock.get(&self.layout_version) {
|
||||
let prev_lock = counter.fetch_sub(1, Ordering::Relaxed);
|
||||
if prev_lock == 1 && layout.current().version > self.layout_version {
|
||||
drop(layout); // release read lock, write lock will be acquired
|
||||
self.layout_manager.ack_new_version();
|
||||
}
|
||||
} else {
|
||||
error!("Could not find ack lock counter for layout version {}. This probably indicates a bug in Garage.", self.layout_version);
|
||||
}
|
||||
}
|
||||
}
|
478
src/rpc/layout/mod.rs
Normal file
478
src/rpc/layout/mod.rs
Normal file
|
@ -0,0 +1,478 @@
|
|||
use std::fmt;
|
||||
|
||||
use bytesize::ByteSize;
|
||||
|
||||
use garage_util::crdt::{AutoCrdt, Crdt};
|
||||
use garage_util::data::Uuid;
|
||||
|
||||
mod graph_algo;
|
||||
mod helper;
|
||||
mod history;
|
||||
mod version;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
|
||||
pub mod manager;
|
||||
|
||||
// ---- re-exports ----
|
||||
|
||||
pub use helper::{LayoutHelper, RpcLayoutDigest, SyncLayoutDigest};
|
||||
pub use manager::WriteLock;
|
||||
pub use version::*;
|
||||
|
||||
// ---- defines: partitions ----
|
||||
|
||||
/// A partition id, which is stored on 16 bits
/// i.e. we have up to 2**16 partitions.
/// (in practice we have exactly 2**PARTITION_BITS partitions)
pub type Partition = u16;

// TODO: make this constant parametrizable in the config file
// For deployments with many nodes it might make sense to bump
// it up to 10.
// Maximum value : 16
/// How many bits from the hash are used to make partitions. Higher numbers mean more fairness in
/// the presence of numerous nodes, but an exponentially bigger ring. Max 16
pub const PARTITION_BITS: usize = 8;

/// Total number of partitions, i.e. 2**PARTITION_BITS.
const NB_PARTITIONS: usize = 1usize << PARTITION_BITS;

// ---- defines: nodes ----

/// Type to store compactly the id of a node in the system.
/// Change this to u16 the day we want to have more than 256 nodes in a cluster;
/// `MAX_NODE_NUMBER` is derived from it and follows automatically.
pub type CompactNodeType = u8;
/// Number of distinct node ids representable by `CompactNodeType`
/// (256 while it is `u8`).
pub const MAX_NODE_NUMBER: usize = 1usize << CompactNodeType::BITS;
|
||||
|
||||
// ======== actual data structures for the layout data ========
|
||||
// ======== that is persisted to disk ========
|
||||
// some small utility impls are at the end of this file,
|
||||
// but most of the code that actually computes stuff is in
|
||||
// version.rs, history.rs and helper.rs
|
||||
|
||||
mod v08 {
|
||||
use crate::layout::CompactNodeType;
|
||||
use garage_util::crdt::LwwMap;
|
||||
use garage_util::data::{Hash, Uuid};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// The layout of the cluster, i.e. the list of roles
|
||||
/// which are assigned to each cluster node
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ClusterLayout {
|
||||
pub version: u64,
|
||||
|
||||
pub replication_factor: usize,
|
||||
pub roles: LwwMap<Uuid, NodeRoleV>,
|
||||
|
||||
// see comments in v010::ClusterLayout
|
||||
pub node_id_vec: Vec<Uuid>,
|
||||
#[serde(with = "serde_bytes")]
|
||||
pub ring_assignation_data: Vec<CompactNodeType>,
|
||||
|
||||
/// Role changes which are staged for the next version of the layout
|
||||
pub staging: LwwMap<Uuid, NodeRoleV>,
|
||||
pub staging_hash: Hash,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct NodeRoleV(pub Option<NodeRole>);
|
||||
|
||||
/// The user-assigned roles of cluster nodes
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct NodeRole {
|
||||
/// Datacenter at which this entry belong. This information is used to
|
||||
/// perform a better geodistribution
|
||||
pub zone: String,
|
||||
/// The capacity of the node
|
||||
/// If this is set to None, the node does not participate in storing data for the system
|
||||
/// and is only active as an API gateway to other nodes
|
||||
pub capacity: Option<u64>,
|
||||
/// A set of tags to recognize the node
|
||||
pub tags: Vec<String>,
|
||||
}
|
||||
|
||||
impl garage_util::migrate::InitialFormat for ClusterLayout {}
|
||||
}
|
||||
|
||||
mod v09 {
|
||||
use super::v08;
|
||||
use crate::layout::CompactNodeType;
|
||||
use garage_util::crdt::{Lww, LwwMap};
|
||||
use garage_util::data::{Hash, Uuid};
|
||||
use serde::{Deserialize, Serialize};
|
||||
pub use v08::{NodeRole, NodeRoleV};
|
||||
|
||||
/// The layout of the cluster, i.e. the list of roles
|
||||
/// which are assigned to each cluster node
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ClusterLayout {
|
||||
pub version: u64,
|
||||
|
||||
pub replication_factor: usize,
|
||||
|
||||
/// This attribute is only used to retain the previously computed partition size,
|
||||
/// to know to what extent does it change with the layout update.
|
||||
pub partition_size: u64,
|
||||
/// Parameters used to compute the assignment currently given by
|
||||
/// ring_assignment_data
|
||||
pub parameters: LayoutParameters,
|
||||
|
||||
pub roles: LwwMap<Uuid, NodeRoleV>,
|
||||
|
||||
// see comments in v010::ClusterLayout
|
||||
pub node_id_vec: Vec<Uuid>,
|
||||
#[serde(with = "serde_bytes")]
|
||||
pub ring_assignment_data: Vec<CompactNodeType>,
|
||||
|
||||
/// Parameters to be used in the next partition assignment computation.
|
||||
pub staging_parameters: Lww<LayoutParameters>,
|
||||
/// Role changes which are staged for the next version of the layout
|
||||
pub staging_roles: LwwMap<Uuid, NodeRoleV>,
|
||||
pub staging_hash: Hash,
|
||||
}
|
||||
|
||||
/// This struct is used to set the parameters to be used in the assignment computation
|
||||
/// algorithm. It is stored as a Crdt.
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub struct LayoutParameters {
|
||||
pub zone_redundancy: ZoneRedundancy,
|
||||
}
|
||||
|
||||
/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
|
||||
/// of each partition on at least that number of different zones.
|
||||
/// Otherwise, copies will be stored on the maximum possible number of zones.
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub enum ZoneRedundancy {
|
||||
AtLeast(usize),
|
||||
Maximum,
|
||||
}
|
||||
|
||||
impl garage_util::migrate::Migrate for ClusterLayout {
|
||||
const VERSION_MARKER: &'static [u8] = b"G09layout";
|
||||
|
||||
type Previous = v08::ClusterLayout;
|
||||
|
||||
fn migrate(previous: Self::Previous) -> Self {
|
||||
use itertools::Itertools;
|
||||
|
||||
// In the old layout, capacities are in an arbitrary unit,
|
||||
// but in the new layout they are in bytes.
|
||||
// Here we arbitrarily multiply everything by 1G,
|
||||
// such that 1 old capacity unit = 1GB in the new units.
|
||||
// This is totally arbitrary and won't work for most users.
|
||||
let cap_mul = 1024 * 1024 * 1024;
|
||||
let roles = multiply_all_capacities(previous.roles, cap_mul);
|
||||
let staging_roles = multiply_all_capacities(previous.staging, cap_mul);
|
||||
let node_id_vec = previous.node_id_vec;
|
||||
|
||||
// Determine partition size
|
||||
let mut tmp = previous.ring_assignation_data.clone();
|
||||
tmp.sort();
|
||||
let partition_size = tmp
|
||||
.into_iter()
|
||||
.dedup_with_count()
|
||||
.map(|(npart, node)| {
|
||||
roles
|
||||
.get(&node_id_vec[node as usize])
|
||||
.and_then(|p| p.0.as_ref().and_then(|r| r.capacity))
|
||||
.unwrap_or(0) / npart as u64
|
||||
})
|
||||
.min()
|
||||
.unwrap_or(0);
|
||||
|
||||
// By default, zone_redundancy is maximum possible value
|
||||
let parameters = LayoutParameters {
|
||||
zone_redundancy: ZoneRedundancy::Maximum,
|
||||
};
|
||||
|
||||
Self {
|
||||
version: previous.version,
|
||||
replication_factor: previous.replication_factor,
|
||||
partition_size,
|
||||
parameters,
|
||||
roles,
|
||||
node_id_vec,
|
||||
ring_assignment_data: previous.ring_assignation_data,
|
||||
staging_parameters: Lww::new(parameters),
|
||||
staging_roles,
|
||||
staging_hash: [0u8; 32].into(), // will be set in the next migration
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn multiply_all_capacities(
|
||||
old_roles: LwwMap<Uuid, NodeRoleV>,
|
||||
mul: u64,
|
||||
) -> LwwMap<Uuid, NodeRoleV> {
|
||||
let mut new_roles = LwwMap::new();
|
||||
for (node, ts, role) in old_roles.items() {
|
||||
let mut role = role.clone();
|
||||
if let NodeRoleV(Some(NodeRole {
|
||||
capacity: Some(ref mut cap),
|
||||
..
|
||||
})) = role
|
||||
{
|
||||
*cap *= mul;
|
||||
}
|
||||
new_roles.merge_raw(node, *ts, &role);
|
||||
}
|
||||
new_roles
|
||||
}
|
||||
}
|
||||
|
||||
mod v010 {
|
||||
use super::v09;
|
||||
use crate::layout::CompactNodeType;
|
||||
use garage_util::crdt::{Lww, LwwMap};
|
||||
use garage_util::data::Uuid;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy};
|
||||
|
||||
/// Number of old (non-live) versions to keep, see LayoutHistory::old_versions
|
||||
pub const OLD_VERSION_COUNT: usize = 5;
|
||||
|
||||
/// The history of cluster layouts, with trackers to keep a record
|
||||
/// of which nodes are up-to-date to current cluster data
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
|
||||
pub struct LayoutHistory {
|
||||
/// The versions currently in use in the cluster
|
||||
pub versions: Vec<LayoutVersion>,
|
||||
/// At most 5 of the previous versions, not used by the garage_table
|
||||
/// module, but usefull for the garage_block module to find data blocks
|
||||
/// that have not yet been moved
|
||||
pub old_versions: Vec<LayoutVersion>,
|
||||
|
||||
/// Update trackers
|
||||
pub update_trackers: UpdateTrackers,
|
||||
|
||||
/// Staged changes for the next version
|
||||
pub staging: Lww<LayoutStaging>,
|
||||
}
|
||||
|
||||
/// A version of the layout of the cluster, i.e. the list of roles
|
||||
/// which are assigned to each cluster node
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
|
||||
pub struct LayoutVersion {
|
||||
/// The number of this version
|
||||
pub version: u64,
|
||||
|
||||
/// Roles assigned to nodes in this version
|
||||
pub roles: LwwMap<Uuid, NodeRoleV>,
|
||||
/// Parameters used to compute the assignment currently given by
|
||||
/// ring_assignment_data
|
||||
pub parameters: LayoutParameters,
|
||||
|
||||
/// The number of replicas for each data partition
|
||||
pub replication_factor: usize,
|
||||
/// This attribute is only used to retain the previously computed partition size,
|
||||
/// to know to what extent does it change with the layout update.
|
||||
pub partition_size: u64,
|
||||
|
||||
/// node_id_vec: a vector of node IDs with a role assigned
|
||||
/// in the system (this includes gateway nodes).
|
||||
/// The order here is different than the vec stored by `roles`, because:
|
||||
/// 1. non-gateway nodes are first so that they have lower numbers
|
||||
/// 2. nodes that don't have a role are excluded (but they need to
|
||||
/// stay in the CRDT as tombstones)
|
||||
pub node_id_vec: Vec<Uuid>,
|
||||
/// number of non-gateway nodes, which are the first ids in node_id_vec
|
||||
pub nongateway_node_count: usize,
|
||||
/// The assignation of data partitions to nodes, the values
|
||||
/// are indices in node_id_vec
|
||||
#[serde(with = "serde_bytes")]
|
||||
pub ring_assignment_data: Vec<CompactNodeType>,
|
||||
}
|
||||
|
||||
/// The staged changes for the next layout version
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
|
||||
pub struct LayoutStaging {
|
||||
/// Parameters to be used in the next partition assignment computation.
|
||||
pub parameters: Lww<LayoutParameters>,
|
||||
/// Role changes which are staged for the next version of the layout
|
||||
pub roles: LwwMap<Uuid, NodeRoleV>,
|
||||
}
|
||||
|
||||
/// The tracker of acknowlegments and data syncs around the cluster
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)]
|
||||
pub struct UpdateTrackers {
|
||||
/// The highest layout version number each node has ack'ed
|
||||
pub ack_map: UpdateTracker,
|
||||
/// The highest layout version number each node has synced data for
|
||||
pub sync_map: UpdateTracker,
|
||||
/// The highest layout version number each node has
|
||||
/// ack'ed that all other nodes have synced data for
|
||||
pub sync_ack_map: UpdateTracker,
|
||||
}
|
||||
|
||||
/// Generic update tracker struct
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)]
|
||||
pub struct UpdateTracker(pub BTreeMap<Uuid, u64>);
|
||||
|
||||
impl garage_util::migrate::Migrate for LayoutHistory {
|
||||
const VERSION_MARKER: &'static [u8] = b"G010lh";
|
||||
|
||||
type Previous = v09::ClusterLayout;
|
||||
|
||||
fn migrate(previous: Self::Previous) -> Self {
|
||||
let nongateway_node_count = previous
|
||||
.node_id_vec
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, uuid)| {
|
||||
let role = previous.roles.get(uuid);
|
||||
matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some())
|
||||
})
|
||||
.map(|(i, _)| i + 1)
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
|
||||
let version = LayoutVersion {
|
||||
version: previous.version,
|
||||
replication_factor: previous.replication_factor,
|
||||
partition_size: previous.partition_size,
|
||||
parameters: previous.parameters,
|
||||
roles: previous.roles,
|
||||
node_id_vec: previous.node_id_vec,
|
||||
nongateway_node_count,
|
||||
ring_assignment_data: previous.ring_assignment_data,
|
||||
};
|
||||
let update_tracker = UpdateTracker(
|
||||
version
|
||||
.nongateway_nodes()
|
||||
.iter()
|
||||
.copied()
|
||||
.map(|x| (x, version.version))
|
||||
.collect::<BTreeMap<Uuid, u64>>(),
|
||||
);
|
||||
let staging = LayoutStaging {
|
||||
parameters: previous.staging_parameters,
|
||||
roles: previous.staging_roles,
|
||||
};
|
||||
Self {
|
||||
versions: vec![version],
|
||||
old_versions: vec![],
|
||||
update_trackers: UpdateTrackers {
|
||||
ack_map: update_tracker.clone(),
|
||||
sync_map: update_tracker.clone(),
|
||||
sync_ack_map: update_tracker,
|
||||
},
|
||||
staging: Lww::raw(previous.version, staging),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub use v010::*;
|
||||
|
||||
// ---- utility functions ----
|
||||
|
||||
/// Opts `LayoutParameters` into the `AutoCrdt` trait.
impl AutoCrdt for LayoutParameters {
    // NOTE(review): presumably asks the generic merge to warn when two
    // differing values meet; confirm against the `AutoCrdt` definition.
    const WARN_IF_DIFFERENT: bool = true;
}
|
||||
|
||||
/// Opts `NodeRoleV` into the `AutoCrdt` trait.
impl AutoCrdt for NodeRoleV {
    // NOTE(review): presumably asks the generic merge to warn when two
    // differing values meet; confirm against the `AutoCrdt` definition.
    const WARN_IF_DIFFERENT: bool = true;
}
|
||||
|
||||
impl Crdt for LayoutStaging {
|
||||
fn merge(&mut self, other: &LayoutStaging) {
|
||||
self.parameters.merge(&other.parameters);
|
||||
self.roles.merge(&other.roles);
|
||||
}
|
||||
}
|
||||
|
||||
impl NodeRole {
|
||||
pub fn capacity_string(&self) -> String {
|
||||
match self.capacity {
|
||||
Some(c) => ByteSize::b(c).to_string_as(false),
|
||||
None => "gateway".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tags_string(&self) -> String {
|
||||
self.tags.join(",")
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZoneRedundancy {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
ZoneRedundancy::Maximum => write!(f, "maximum"),
|
||||
ZoneRedundancy::AtLeast(x) => write!(f, "{}", x),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl core::str::FromStr for ZoneRedundancy {
|
||||
type Err = &'static str;
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum),
|
||||
x => {
|
||||
let v = x
|
||||
.parse::<usize>()
|
||||
.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?;
|
||||
Ok(ZoneRedundancy::AtLeast(v))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UpdateTracker {
|
||||
fn merge(&mut self, other: &UpdateTracker) -> bool {
|
||||
let mut changed = false;
|
||||
for (k, v) in other.0.iter() {
|
||||
if let Some(v_mut) = self.0.get_mut(k) {
|
||||
if *v > *v_mut {
|
||||
*v_mut = *v;
|
||||
changed = true;
|
||||
}
|
||||
} else {
|
||||
self.0.insert(*k, *v);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
changed
|
||||
}
|
||||
|
||||
/// This bumps the update tracker for a given node up to the specified value.
|
||||
/// This has potential impacts on the correctness of Garage and should only
|
||||
/// be used in very specific circumstances.
|
||||
pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool {
|
||||
match self.0.get_mut(&peer) {
|
||||
Some(e) if *e < value => {
|
||||
*e = value;
|
||||
true
|
||||
}
|
||||
None => {
|
||||
self.0.insert(peer, value);
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 {
|
||||
storage_nodes
|
||||
.iter()
|
||||
.map(|x| self.get(x, min_version))
|
||||
.min()
|
||||
.unwrap_or(min_version)
|
||||
}
|
||||
|
||||
pub fn get(&self, node: &Uuid, min_version: u64) -> u64 {
|
||||
self.0.get(node).copied().unwrap_or(min_version)
|
||||
}
|
||||
}
|
||||
|
||||
impl UpdateTrackers {
    /// Merges each of the three trackers of `other` into its counterpart in
    /// `self`. Returns `true` if at least one of them was modified.
    pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool {
        // Accumulate with `|=` rather than chaining `||`: all three merges
        // must run even when an earlier one already reported a change.
        let mut changed = self.ack_map.merge(&other.ack_map);
        changed |= self.sync_map.merge(&other.sync_map);
        changed |= self.sync_ack_map.merge(&other.sync_ack_map);
        changed
    }
}
|
158
src/rpc/layout/test.rs
Normal file
158
src/rpc/layout/test.rs
Normal file
|
@ -0,0 +1,158 @@
|
|||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use garage_util::crdt::Crdt;
|
||||
use garage_util::error::*;
|
||||
|
||||
use crate::layout::*;
|
||||
use crate::replication_mode::ReplicationFactor;
|
||||
|
||||
// This function checks that the partition size S computed is at least better than the
|
||||
// one given by a very naive algorithm. To do so, we try to run the naive algorithm
|
||||
// assuming a partition size of S+1. If we succeed, it means that the optimal assignment
|
||||
// was not optimal. The naive algorithm is the following :
|
||||
// - we compute the max number of partitions associated to every node, capped at the
|
||||
// partition number. It gives the number of tokens of every node.
|
||||
// - every zone has a number of tokens equal to the sum of the tokens of its nodes.
|
||||
// - we cycle over the partitions and associate zone tokens while respecting the
|
||||
// zone redundancy constraint.
|
||||
// NOTE: the naive algorithm is not optimal. Counter example:
|
||||
// take nb_partition = 3 ; replication_factor = 5; redundancy = 4;
|
||||
// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2)
|
||||
// With these parameters, the naive algo fails, whereas there is a solution:
|
||||
// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E)
|
||||
fn check_against_naive(cl: &LayoutVersion) -> Result<bool, Error> {
|
||||
let over_size = cl.partition_size + 1;
|
||||
let mut zone_token = HashMap::<String, usize>::new();
|
||||
|
||||
let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?;
|
||||
|
||||
if zones.is_empty() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
for z in zones.iter() {
|
||||
zone_token.insert(z.clone(), 0);
|
||||
}
|
||||
for uuid in cl.nongateway_nodes() {
|
||||
let z = cl.expect_get_node_zone(&uuid);
|
||||
let c = cl.expect_get_node_capacity(&uuid);
|
||||
zone_token.insert(
|
||||
z.to_string(),
|
||||
zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize),
|
||||
);
|
||||
}
|
||||
|
||||
// For every partition, we count the number of zone already associated and
|
||||
// the name of the last zone associated
|
||||
|
||||
let mut id_zone_token = vec![0; zones.len()];
|
||||
for (z, t) in zone_token.iter() {
|
||||
id_zone_token[zone_to_id[z]] = *t;
|
||||
}
|
||||
|
||||
let mut nb_token = vec![0; NB_PARTITIONS];
|
||||
let mut last_zone = vec![zones.len(); NB_PARTITIONS];
|
||||
|
||||
let mut curr_zone = 0;
|
||||
|
||||
let redundancy = cl.effective_zone_redundancy();
|
||||
|
||||
for replic in 0..cl.replication_factor {
|
||||
for p in 0..NB_PARTITIONS {
|
||||
while id_zone_token[curr_zone] == 0
|
||||
|| (last_zone[p] == curr_zone
|
||||
&& redundancy - nb_token[p] <= cl.replication_factor - replic)
|
||||
{
|
||||
curr_zone += 1;
|
||||
if curr_zone >= zones.len() {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
id_zone_token[curr_zone] -= 1;
|
||||
if last_zone[p] != curr_zone {
|
||||
nb_token[p] += 1;
|
||||
last_zone[p] = curr_zone;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
fn show_msg(msg: &Message) {
|
||||
for s in msg.iter() {
|
||||
println!("{}", s);
|
||||
}
|
||||
}
|
||||
|
||||
fn update_layout(
|
||||
cl: &mut LayoutHistory,
|
||||
node_capacity_vec: &[u64],
|
||||
node_zone_vec: &[&'static str],
|
||||
zone_redundancy: usize,
|
||||
) {
|
||||
let staging = cl.staging.get_mut();
|
||||
|
||||
for (i, (capacity, zone)) in node_capacity_vec
|
||||
.iter()
|
||||
.zip(node_zone_vec.iter())
|
||||
.enumerate()
|
||||
{
|
||||
let node_id = [i as u8; 32].into();
|
||||
|
||||
let update = staging.roles.update_mutator(
|
||||
node_id,
|
||||
NodeRoleV(Some(NodeRole {
|
||||
zone: zone.to_string(),
|
||||
capacity: Some(*capacity),
|
||||
tags: (vec![]),
|
||||
})),
|
||||
);
|
||||
staging.roles.merge(&update);
|
||||
}
|
||||
staging.parameters.update(LayoutParameters {
|
||||
zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_assignment() {
|
||||
let mut node_capacity_vec = vec![4000, 1000, 2000];
|
||||
let mut node_zone_vec = vec!["A", "B", "C"];
|
||||
|
||||
let mut cl = LayoutHistory::new(ReplicationFactor::new(3).unwrap());
|
||||
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
|
||||
let v = cl.current().version;
|
||||
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
|
||||
show_msg(&msg);
|
||||
assert_eq!(cl.check(), Ok(()));
|
||||
assert!(check_against_naive(cl.current()).unwrap());
|
||||
|
||||
node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000];
|
||||
node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"];
|
||||
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2);
|
||||
let v = cl.current().version;
|
||||
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
|
||||
show_msg(&msg);
|
||||
assert_eq!(cl.check(), Ok(()));
|
||||
assert!(check_against_naive(cl.current()).unwrap());
|
||||
|
||||
node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
|
||||
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
|
||||
let v = cl.current().version;
|
||||
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
|
||||
show_msg(&msg);
|
||||
assert_eq!(cl.check(), Ok(()));
|
||||
assert!(check_against_naive(cl.current()).unwrap());
|
||||
|
||||
node_capacity_vec = vec![
|
||||
4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000,
|
||||
];
|
||||
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 1);
|
||||
let v = cl.current().version;
|
||||
let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
|
||||
show_msg(&msg);
|
||||
assert_eq!(cl.check(), Ok(()));
|
||||
assert!(check_against_naive(cl.current()).unwrap());
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue