Compare commits

...

95 commits

Author SHA1 Message Date
Alex 0038ca8a78
Merge branch 'main' into next-0.10
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
2024-03-18 20:19:30 +01:00
Alex 81191d2d92 Merge pull request 'Remove Sled' (#767) from rm-sled into next-0.10
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
Reviewed-on: #767
2024-03-12 10:45:57 +00:00
Alex 2795b53b8b
[rm-sled] factorize some code in sqlite backend
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
2024-03-12 11:15:26 +01:00
Alex 32aa246300
[rm-sled] Make proper use of pinning in LMDB adapter + comment unsafe
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/3 Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline was successful
ci/woodpecker/deployment/release/1 Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline was successful
ci/woodpecker/deployment/publish Pipeline was successful
2024-03-08 17:39:17 +01:00
Alex b942949940
[rm-sled] Implement iterators in sqlite & lmdb transactions
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
with way too much unsafe code
2024-03-08 16:38:01 +01:00
Alex 66c23890c1
[rm-sled] Implement some missing functionality in garage_db
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
2024-03-08 16:02:58 +01:00
Alex 05c92204ec
[rm-sled] Remove counted_tree_hack
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
2024-03-08 15:09:57 +01:00
Alex 2128b5febd Merge pull request 'Remove migration path from Garage v0.5' (#766) from rm-migration into next-0.10
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
Reviewed-on: #766
2024-03-08 13:43:42 +00:00
Alex 44454aac01
[rm-sled] Remove the Sled database engine
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
2024-03-08 14:11:02 +01:00
Alex 1ace34adbb
Merge branch 'main' into next-0.10
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
2024-03-08 13:57:10 +01:00
Alex f537f76681
[rm-migration] Remove migration path from Garage v0.5
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/3 Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline was successful
ci/woodpecker/deployment/release/1 Pipeline was successful
ci/woodpecker/deployment/publish Pipeline was successful
2024-03-08 13:24:47 +01:00
Alex 20c0b4ffb2 Merge pull request 'ReplicationMode -> ConsistencyMode+ReplicationFactor' (#750) from yuka/garage:split-consistency-mode into next-0.10
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
Reviewed-on: #750
2024-03-07 16:32:52 +00:00
Alex 2fd13c7d13 Merge pull request 'SSE-C encryption' (#730) from sse-c into next-0.10
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
Reviewed-on: #730
2024-03-07 15:21:37 +00:00
Alex 3fcb54e3cf
[sse-c] Remove special case for Content-Type header
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/1 Pipeline was successful
ci/woodpecker/deployment/release/3 Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline was successful
ci/woodpecker/deployment/publish Pipeline was successful
2024-03-07 15:43:48 +01:00
Alex e3333f2ac5
[sse-c] Documentation for SSE-C 2024-03-07 15:43:48 +01:00
Alex fa4878bad6
[sse-c] Testing for SSE-C encryption 2024-03-07 15:43:48 +01:00
Alex 57acc60082
[sse-c] Implement SSE-C encryption 2024-03-07 15:43:47 +01:00
Alex fe2dc5d51c
Merge branch 'main' into next-0.10
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
2024-03-07 14:00:34 +01:00
Yureka c1769bbe69 ReplicationMode -> ConsistencyMode+ReplicationFactor
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/1 Pipeline was successful
ci/woodpecker/deployment/release/3 Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline was successful
ci/woodpecker/deployment/publish Pipeline was successful
2024-03-07 12:45:33 +01:00
Yureka 8f86af52ed adjust docs for replication factor 2024-03-05 22:57:08 +01:00
Alex 603604cdfc Merge pull request 'refactor: remove max_write_errors and max_faults' (#760) from yuka/garage:remove-max-write-errors into next-0.10
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
Reviewed-on: #760
2024-03-05 21:56:17 +00:00
Yureka 6760895926 refactor: remove max_write_errors and max_faults
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
2024-03-04 18:39:56 +01:00
Alex bbde9bc912
Merge branch 'main' into next-0.10
Some checks failed
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline failed
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
2024-03-04 15:56:10 +01:00
Alex d0d95fd53f
[next-0.10] woodpecker: run debug pipeline on manual trigger
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/1 Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline was successful
ci/woodpecker/deployment/release/3 Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline was successful
ci/woodpecker/deployment/publish Pipeline was successful
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
2024-02-27 10:13:09 +01:00
Alex 4b978b7533
Merge branch 'main' into next-0.10 2024-02-26 18:55:24 +01:00
Alex 3692af7052
Merge branch 'main' into next-0.10 2024-02-23 18:28:05 +01:00
Alex 916c67ccf4
Merge branch 'main' into next-0.10 2024-02-23 16:50:34 +01:00
Alex 81cebdd124
[next-0.10] fix build
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline was successful
ci/woodpecker/deployment/release/1 Pipeline was successful
ci/woodpecker/deployment/release/3 Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline was successful
ci/woodpecker/deployment/publish Pipeline was successful
2024-02-22 15:53:47 +01:00
Alex 59f61c966a
Merge branch 'main' into next-0.10
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
2024-02-22 15:45:45 +01:00
Alex 75e591727d
[next-0.10] cluster node status metrics: report nodes of all active layout versions
Some checks failed
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline failed
ci/woodpecker/deployment/release/1 Pipeline failed
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline failed
ci/woodpecker/deployment/release/3 Pipeline failed
ci/woodpecker/deployment/publish unknown status
2024-02-20 17:08:31 +01:00
Alex 643d1aabd8
Merge branch 'main' into next-0.10
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
2024-02-20 17:02:44 +01:00
Alex eb4a6ce106
Merge branch 'main' into next-0.10
All checks were successful
ci/woodpecker/pr/debug Pipeline was successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/deployment/debug Pipeline was successful
ci/woodpecker/deployment/release/2 Pipeline was successful
ci/woodpecker/deployment/release/1 Pipeline was successful
ci/woodpecker/deployment/release/3 Pipeline was successful
ci/woodpecker/deployment/release/4 Pipeline was successful
ci/woodpecker/deployment/publish Pipeline was successful
2024-02-15 14:06:34 +01:00
Alex cf2af186fc
Merge branch 'main' into next-0.10
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/pr/debug Pipeline was successful
2024-02-13 11:36:28 +01:00
Alex db48dd3d6c
bump crate versions to 0.10.0
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2024-01-11 12:05:51 +01:00
Alex 8a6ec1d611 Merge pull request 'NLnet task 3' (#667) from nlnet-task3 into next-0.10
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
Reviewed-on: #667
2024-01-11 10:58:08 +00:00
Alex 0041b013a4
layout: refactoring and fix in layout helper
Some checks reported errors
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build was killed
continuous-integration/drone Build is passing
2023-12-11 16:09:22 +01:00
Alex adccce1145
layout: refactor/fix bad while loop
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-11 15:45:14 +01:00
Alex 85b5a6bcd1
fix some clippy lints
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-11 15:31:47 +01:00
Alex e4f493b481
table: remove redundant tracing in insert_many
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-11 14:57:42 +01:00
Alex f8df90b79b
table: fix insert_many to not send duplicates
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-08 14:54:11 +01:00
Alex 4dbf254512
layout: refactoring, merge two files
Some checks reported errors
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build was killed
2023-12-08 14:15:52 +01:00
Alex 64a6e557a4
rpc helper: small refactorings
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-08 12:18:12 +01:00
Alex 5dd200c015
layout: move block_read_nodes_of to rpc_helper to avoid double-locking
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
(in theory, this could have caused a deadlock)
2023-12-08 12:02:24 +01:00
Alex 063294dd56
layout version: refactor get_node_zone
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-12-08 11:50:58 +01:00
Alex 7f2541101f
cli: improvements to the layout commands when multiple layouts are live
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-08 11:24:23 +01:00
Alex 91b874c4ef
rpc: fix system::health
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-12-08 10:36:37 +01:00
Alex 431b28e0cf
fix build with discovery features
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2023-12-07 15:15:59 +01:00
Alex 9cecea64d4
layout: allow sync update tracker to progress with only quorums
Some checks reported errors
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
continuous-integration/drone Build was killed
2023-12-07 14:51:20 +01:00
Alex aa59059a91
layout cli: safer skip-dead-nodes command
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-07 11:56:14 +01:00
Alex d90de365b3
table sync: use write quorums to report global success or failure of sync
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-12-07 11:16:10 +01:00
Alex 95eb13eb08
rpc: refactor result tracking for quorum sets
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-12-07 10:57:21 +01:00
Alex c8356a91d9
layout updates: fix the set of nodes among which minima are calculated
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-12-07 10:30:26 +01:00
Alex c04dd8788a
admin: more info in admin GetClusterStatus
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-11-28 14:25:04 +01:00
Alex 539af6eac4
rpc helper: write comments + small refactoring of tracing
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-11-28 11:12:39 +01:00
Alex c539077d30
cli: remove historic layout info from status
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-11-27 16:22:27 +01:00
Alex 11e6fef93c
cli: add layout history and layout assume-sync commands 2023-11-27 16:22:25 +01:00
Alex 539a920313
cli: show when nodes are draining metadata
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-27 13:18:59 +01:00
Alex 78362140f5
rpc: update system::health to take into account write sets for all partitions
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-11-27 12:10:21 +01:00
Alex d6d239fc79
block manager: read_block using old layout versions if necessary
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-11-27 11:52:57 +01:00
Alex 3ecd14b9f6
table: implement write sets for insert_many
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-11-16 16:41:45 +01:00
Alex 22f38808e7
rpc_helper: don't use tokio::spawn for individual requests 2023-11-16 16:34:01 +01:00
Alex 707442f5de
layout: refactor digests and add "!=" assertions before epidemic bcast
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
continuous-integration/drone Build is passing
2023-11-16 13:51:40 +01:00
Alex ad5c6f779f
layout: split helper in separate file; more precise difference tracking
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-11-16 13:26:43 +01:00
Alex d4df03424f
layout: fix test
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
continuous-integration/drone Build is passing
2023-11-15 15:56:57 +01:00
Alex 33c8a489b0
layou: implement ack locking
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-15 15:40:44 +01:00
Alex 393c4d4515
layout: add helper for cached/external values to centralize recomputation
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-15 14:20:50 +01:00
Alex 65066c7064
layout: wip cache global mins 2023-11-15 13:28:30 +01:00
Alex acd49de9f9
rpc: fix write set quorums
All checks were successful
continuous-integration/drone/pr Build is passing
continuous-integration/drone/push Build is passing
2023-11-15 13:07:42 +01:00
Alex 46007bf01d
integration test: print stdout and stderr on subcommand crash 2023-11-15 12:56:52 +01:00
Alex b3e729f4b8
layout history merge: rm invalid versions when valid versions are added
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-15 12:15:58 +01:00
Alex 7ef2c23120
layout: fix test
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-14 15:45:01 +01:00
Alex 90e1619b1e
table: take into account multiple write sets in inserts
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-14 15:40:46 +01:00
Alex 3b361d2959
layout: prepare for write sets
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-14 14:28:16 +01:00
Alex 866196750f
system: add todo wrt new layout 2023-11-14 13:36:58 +01:00
Alex 83a11374ca
layout: fixes in schema
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-14 13:29:26 +01:00
Alex 1aab1f4e68
layout: refactoring of all_nodes
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-14 13:12:32 +01:00
Alex 8e292e06b3
layout: some refactoring of nongateway nodes
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-14 12:48:38 +01:00
Alex 9a491fa137
layout: fix test
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is passing
2023-11-11 13:10:59 +01:00
Alex df24bb806d
layout/sync: fix bugs and add tracing
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-11 12:44:27 +01:00
Alex ce89d1ddab
table sync: adapt to new layout history
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-11 12:08:32 +01:00
Alex df36cf3099
layout: add helpers to LayoutHistory and prepare integration with Table
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-09 16:32:31 +01:00
Alex 9d95f6f704
layout: fix tracker bugs
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-09 15:52:45 +01:00
Alex bad7cc812e
layout admin: add missing calls to update_hash
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-09 15:42:10 +01:00
Alex 03ebf18830
layout: begin managing the update tracker values
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-09 15:31:59 +01:00
Alex 94caf9c0c1
layout: separate code path for synchronizing update trackers only
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-09 14:53:34 +01:00
Alex bfb1845fdc
layout: refactor to use a RwLock on LayoutHistory
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-09 14:12:05 +01:00
Alex 19ef1ec8e7
layout: more refactoring 2023-11-09 13:34:14 +01:00
Alex 8a2b1dd422
wip: split out layout management from System into separate LayoutManager
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-09 12:55:36 +01:00
Alex 523d2ecb95
layout: use separate CRDT for staged layout changes
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-09 11:19:43 +01:00
Alex 1da0a5676e
bump garage protocol version tag to 0x000A (0.10)
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-08 19:30:58 +01:00
Alex 8dccee3ccf
cluster layout: adapt all uses of ClusterLayout to LayoutHistory
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-08 19:28:36 +01:00
Alex fe9af1dcaa
WIP: garage_rpc: store layout version history 2023-11-08 17:49:06 +01:00
Alex 4a9c94514f
avoid using layout_watch in System directly
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing
2023-11-08 16:41:00 +01:00
Alex 12d1dbfc6b
remove Ring and use ClusterLayout everywhere
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing
2023-11-08 15:41:24 +01:00
Alex 0962313ebd
garage_rpc: reorder functions in layout.rs 2023-11-08 13:13:04 +01:00
106 changed files with 6093 additions and 3466 deletions

View file

@ -5,6 +5,7 @@ when:
- pull_request - pull_request
- deployment - deployment
- cron - cron
- manual
steps: steps:
- name: check formatting - name: check formatting
@ -33,7 +34,6 @@ steps:
- ./result/bin/garage_util-* - ./result/bin/garage_util-*
- ./result/bin/garage_web-* - ./result/bin/garage_web-*
- ./result/bin/garage-* - ./result/bin/garage-*
- GARAGE_TEST_INTEGRATION_DB_ENGINE=sled ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false)
- GARAGE_TEST_INTEGRATION_DB_ENGINE=lmdb ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false) - GARAGE_TEST_INTEGRATION_DB_ENGINE=lmdb ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false)
- GARAGE_TEST_INTEGRATION_DB_ENGINE=sqlite ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false) - GARAGE_TEST_INTEGRATION_DB_ENGINE=sqlite ./result/bin/integration-* || (cat tmp-garage-integration/stderr.log; false)
- rm result - rm result

171
Cargo.lock generated
View file

@ -17,6 +17,41 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aead"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"
dependencies = [
"crypto-common",
"generic-array",
]
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures",
]
[[package]]
name = "aes-gcm"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"
dependencies = [
"aead",
"aes",
"cipher",
"ctr",
"ghash",
"subtle",
]
[[package]] [[package]]
name = "ahash" name = "ahash"
version = "0.8.7" version = "0.8.7"
@ -761,6 +796,16 @@ dependencies = [
"windows-targets 0.52.0", "windows-targets 0.52.0",
] ]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]] [[package]]
name = "clap" name = "clap"
version = "2.34.0" version = "2.34.0"
@ -876,15 +921,6 @@ dependencies = [
"crossbeam-utils", "crossbeam-utils",
] ]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]] [[package]]
name = "crossbeam-queue" name = "crossbeam-queue"
version = "0.3.11" version = "0.3.11"
@ -929,9 +965,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [ dependencies = [
"generic-array", "generic-array",
"rand_core",
"typenum", "typenum",
] ]
[[package]]
name = "ctr"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"
dependencies = [
"cipher",
]
[[package]] [[package]]
name = "darling" name = "darling"
version = "0.20.5" version = "0.20.5"
@ -1167,16 +1213,6 @@ dependencies = [
name = "format_table" name = "format_table"
version = "0.1.1" version = "0.1.1"
[[package]]
name = "fs2"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
dependencies = [
"libc",
"winapi",
]
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.30" version = "0.3.30"
@ -1266,18 +1302,9 @@ dependencies = [
"slab", "slab",
] ]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]] [[package]]
name = "garage" name = "garage"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"assert-json-diff", "assert-json-diff",
"async-trait", "async-trait",
@ -1331,9 +1358,11 @@ dependencies = [
[[package]] [[package]]
name = "garage_api" name = "garage_api"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"aes-gcm",
"argon2", "argon2",
"async-compression",
"async-trait", "async-trait",
"base64 0.21.7", "base64 0.21.7",
"bytes", "bytes",
@ -1374,13 +1403,14 @@ dependencies = [
"sha2", "sha2",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tokio-util 0.7.10",
"tracing", "tracing",
"url", "url",
] ]
[[package]] [[package]]
name = "garage_block" name = "garage_block"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"async-compression", "async-compression",
@ -1407,7 +1437,7 @@ dependencies = [
[[package]] [[package]]
name = "garage_db" name = "garage_db"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"err-derive", "err-derive",
"heed", "heed",
@ -1416,13 +1446,12 @@ dependencies = [
"r2d2", "r2d2",
"r2d2_sqlite", "r2d2_sqlite",
"rusqlite", "rusqlite",
"sled",
"tracing", "tracing",
] ]
[[package]] [[package]]
name = "garage_model" name = "garage_model"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"async-trait", "async-trait",
@ -1439,6 +1468,7 @@ dependencies = [
"garage_table", "garage_table",
"garage_util", "garage_util",
"hex", "hex",
"http 1.0.0",
"opentelemetry", "opentelemetry",
"parse_duration", "parse_duration",
"rand", "rand",
@ -1451,7 +1481,7 @@ dependencies = [
[[package]] [[package]]
name = "garage_net" name = "garage_net"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"async-trait", "async-trait",
@ -1477,7 +1507,7 @@ dependencies = [
[[package]] [[package]]
name = "garage_rpc" name = "garage_rpc"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"async-trait", "async-trait",
@ -1512,7 +1542,7 @@ dependencies = [
[[package]] [[package]]
name = "garage_table" name = "garage_table"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"async-trait", "async-trait",
@ -1534,7 +1564,7 @@ dependencies = [
[[package]] [[package]]
name = "garage_util" name = "garage_util"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"async-trait", "async-trait",
@ -1568,7 +1598,7 @@ dependencies = [
[[package]] [[package]]
name = "garage_web" name = "garage_web"
version = "0.9.3" version = "0.10.0"
dependencies = [ dependencies = [
"err-derive", "err-derive",
"futures", "futures",
@ -1617,6 +1647,16 @@ dependencies = [
"wasi", "wasi",
] ]
[[package]]
name = "ghash"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"
dependencies = [
"opaque-debug",
"polyval",
]
[[package]] [[package]]
name = "gimli" name = "gimli"
version = "0.28.1" version = "0.28.1"
@ -2066,6 +2106,15 @@ dependencies = [
"hashbrown 0.14.3", "hashbrown 0.14.3",
] ]
[[package]]
name = "inout"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
dependencies = [
"generic-array",
]
[[package]] [[package]]
name = "instant" name = "instant"
version = "0.1.12" version = "0.1.12"
@ -2646,6 +2695,12 @@ version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "opaque-debug"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
[[package]] [[package]]
name = "openssl-probe" name = "openssl-probe"
version = "0.1.5" version = "0.1.5"
@ -2983,6 +3038,18 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "polyval"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"
dependencies = [
"cfg-if",
"cpufeatures",
"opaque-debug",
"universal-hash",
]
[[package]] [[package]]
name = "powerfmt" name = "powerfmt"
version = "0.2.0" version = "0.2.0"
@ -3769,22 +3836,6 @@ dependencies = [
"autocfg", "autocfg",
] ]
[[package]]
name = "sled"
version = "0.34.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935"
dependencies = [
"crc32fast",
"crossbeam-epoch",
"crossbeam-utils",
"fs2",
"fxhash",
"libc",
"log",
"parking_lot 0.11.2",
]
[[package]] [[package]]
name = "smallvec" name = "smallvec"
version = "1.13.1" version = "1.13.1"
@ -4433,6 +4484,16 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
[[package]]
name = "universal-hash"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"
dependencies = [
"crypto-common",
"subtle",
]
[[package]] [[package]]
name = "unsafe-libyaml" name = "unsafe-libyaml"
version = "0.2.10" version = "0.2.10"

333
Cargo.nix
View file

@ -34,7 +34,7 @@ args@{
ignoreLockHash, ignoreLockHash,
}: }:
let let
nixifiedLockHash = "b35dd31aa882ac6fc7105fb99a6681a4777ed269191cb0a8c8db843910748435"; nixifiedLockHash = "f73523af24b5164222da0a1c326ba65fa4a01b55751dd9ddab251334cfe20d13";
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc; workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
lockHashIgnored = if ignoreLockHash lockHashIgnored = if ignoreLockHash
@ -58,17 +58,17 @@ in
{ {
cargo2nixVersion = "0.11.0"; cargo2nixVersion = "0.11.0";
workspace = { workspace = {
garage_db = rustPackages.unknown.garage_db."0.9.3"; garage_db = rustPackages.unknown.garage_db."0.10.0";
garage_util = rustPackages.unknown.garage_util."0.9.3"; garage_util = rustPackages.unknown.garage_util."0.10.0";
garage_net = rustPackages.unknown.garage_net."0.9.3"; garage_net = rustPackages.unknown.garage_net."0.10.0";
garage_rpc = rustPackages.unknown.garage_rpc."0.9.3"; garage_rpc = rustPackages.unknown.garage_rpc."0.10.0";
format_table = rustPackages.unknown.format_table."0.1.1"; format_table = rustPackages.unknown.format_table."0.1.1";
garage_table = rustPackages.unknown.garage_table."0.9.3"; garage_table = rustPackages.unknown.garage_table."0.10.0";
garage_block = rustPackages.unknown.garage_block."0.9.3"; garage_block = rustPackages.unknown.garage_block."0.10.0";
garage_model = rustPackages.unknown.garage_model."0.9.3"; garage_model = rustPackages.unknown.garage_model."0.10.0";
garage_api = rustPackages.unknown.garage_api."0.9.3"; garage_api = rustPackages.unknown.garage_api."0.10.0";
garage_web = rustPackages.unknown.garage_web."0.9.3"; garage_web = rustPackages.unknown.garage_web."0.10.0";
garage = rustPackages.unknown.garage."0.9.3"; garage = rustPackages.unknown.garage."0.10.0";
k2v-client = rustPackages.unknown.k2v-client."0.0.4"; k2v-client = rustPackages.unknown.k2v-client."0.0.4";
}; };
"registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec {
@ -88,6 +88,58 @@ in
src = fetchCratesIo { inherit name version; sha256 = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"; }; src = fetchCratesIo { inherit name version; sha256 = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".aead."0.5.2" = overridableMkRustCrate (profileName: rec {
name = "aead";
version = "0.5.2";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"; };
features = builtins.concatLists [
[ "alloc" ]
[ "getrandom" ]
[ "rand_core" ]
[ "stream" ]
];
dependencies = {
crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out;
generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".aes."0.8.4" = overridableMkRustCrate (profileName: rec {
name = "aes";
version = "0.8.4";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"; };
dependencies = {
cfg_if = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cfg-if."1.0.0" { inherit profileName; }).out;
cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out;
${ if hostPlatform.parsed.cpu.name == "aarch64" || hostPlatform.parsed.cpu.name == "x86_64" || hostPlatform.parsed.cpu.name == "i686" then "cpufeatures" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cpufeatures."0.2.12" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".aes-gcm."0.10.3" = overridableMkRustCrate (profileName: rec {
name = "aes-gcm";
version = "0.10.3";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"; };
features = builtins.concatLists [
[ "aes" ]
[ "alloc" ]
[ "default" ]
[ "getrandom" ]
[ "rand_core" ]
[ "stream" ]
];
dependencies = {
aead = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aead."0.5.2" { inherit profileName; }).out;
aes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aes."0.8.4" { inherit profileName; }).out;
cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out;
ctr = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".ctr."0.9.2" { inherit profileName; }).out;
ghash = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".ghash."0.5.1" { inherit profileName; }).out;
subtle = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".subtle."2.5.0" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".ahash."0.8.7" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".ahash."0.8.7" = overridableMkRustCrate (profileName: rec {
name = "ahash"; name = "ahash";
version = "0.8.7"; version = "0.8.7";
@ -1085,6 +1137,17 @@ in
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" = overridableMkRustCrate (profileName: rec {
name = "cipher";
version = "0.4.4";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"; };
dependencies = {
crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out;
inout = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".inout."0.1.3" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".clap."2.34.0" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".clap."2.34.0" = overridableMkRustCrate (profileName: rec {
name = "clap"; name = "clap";
version = "2.34.0"; version = "2.34.0";
@ -1252,21 +1315,6 @@ in
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".crossbeam-epoch."0.9.18" = overridableMkRustCrate (profileName: rec {
name = "crossbeam-epoch";
version = "0.9.18";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"; };
features = builtins.concatLists [
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "alloc")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "std")
];
dependencies = {
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_utils" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-utils."0.8.19" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".crossbeam-queue."0.3.11" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".crossbeam-queue."0.3.11" = overridableMkRustCrate (profileName: rec {
name = "crossbeam-queue"; name = "crossbeam-queue";
version = "0.3.11"; version = "0.3.11";
@ -1288,7 +1336,6 @@ in
registry = "registry+https://github.com/rust-lang/crates.io-index"; registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"; }; src = fetchCratesIo { inherit name version; sha256 = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"; };
features = builtins.concatLists [ features = builtins.concatLists [
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default")
[ "std" ] [ "std" ]
]; ];
}); });
@ -1333,14 +1380,27 @@ in
registry = "registry+https://github.com/rust-lang/crates.io-index"; registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"; }; src = fetchCratesIo { inherit name version; sha256 = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"; };
features = builtins.concatLists [ features = builtins.concatLists [
[ "getrandom" ]
[ "rand_core" ]
[ "std" ] [ "std" ]
]; ];
dependencies = { dependencies = {
generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out; generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out;
rand_core = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand_core."0.6.4" { inherit profileName; }).out;
typenum = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".typenum."1.17.0" { inherit profileName; }).out; typenum = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".typenum."1.17.0" { inherit profileName; }).out;
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".ctr."0.9.2" = overridableMkRustCrate (profileName: rec {
name = "ctr";
version = "0.9.2";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"; };
dependencies = {
cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".darling."0.20.5" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".darling."0.20.5" = overridableMkRustCrate (profileName: rec {
name = "darling"; name = "darling";
version = "0.20.5"; version = "0.20.5";
@ -1699,17 +1759,6 @@ in
src = fetchCrateLocal (workspaceSrc + "/src/format-table"); src = fetchCrateLocal (workspaceSrc + "/src/format-table");
}); });
"registry+https://github.com/rust-lang/crates.io-index".fs2."0.4.3" = overridableMkRustCrate (profileName: rec {
name = "fs2";
version = "0.4.3";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"; };
dependencies = {
${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && hostPlatform.isUnix then "libc" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.153" { inherit profileName; }).out;
${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && hostPlatform.isWindows then "winapi" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".winapi."0.3.9" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" = overridableMkRustCrate (profileName: rec {
name = "futures"; name = "futures";
version = "0.3.30"; version = "0.3.30";
@ -1861,19 +1910,9 @@ in
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".fxhash."0.2.1" = overridableMkRustCrate (profileName: rec { "unknown".garage."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "fxhash";
version = "0.2.1";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"; };
dependencies = {
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "byteorder" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".byteorder."1.5.0" { inherit profileName; }).out;
};
});
"unknown".garage."0.9.3" = overridableMkRustCrate (profileName: rec {
name = "garage"; name = "garage";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/garage"); src = fetchCrateLocal (workspaceSrc + "/src/garage");
features = builtins.concatLists [ features = builtins.concatLists [
@ -1887,7 +1926,6 @@ in
(lib.optional (rootFeatures' ? "garage/opentelemetry-otlp" || rootFeatures' ? "garage/telemetry-otlp") "opentelemetry-otlp") (lib.optional (rootFeatures' ? "garage/opentelemetry-otlp" || rootFeatures' ? "garage/telemetry-otlp") "opentelemetry-otlp")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/opentelemetry-prometheus") "opentelemetry-prometheus") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/opentelemetry-prometheus") "opentelemetry-prometheus")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/prometheus") "prometheus") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/prometheus") "prometheus")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled") "sled")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite") "sqlite") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite") "sqlite")
(lib.optional (rootFeatures' ? "garage/system-libs") "system-libs") (lib.optional (rootFeatures' ? "garage/system-libs") "system-libs")
(lib.optional (rootFeatures' ? "garage/telemetry-otlp") "telemetry-otlp") (lib.optional (rootFeatures' ? "garage/telemetry-otlp") "telemetry-otlp")
@ -1900,15 +1938,15 @@ in
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out; format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
garage_api = (rustPackages."unknown".garage_api."0.9.3" { inherit profileName; }).out; garage_api = (rustPackages."unknown".garage_api."0.10.0" { inherit profileName; }).out;
garage_block = (rustPackages."unknown".garage_block."0.9.3" { inherit profileName; }).out; garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.9.3" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out;
garage_model = (rustPackages."unknown".garage_model."0.9.3" { inherit profileName; }).out; garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out;
garage_net = (rustPackages."unknown".garage_net."0.9.3" { inherit profileName; }).out; garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out;
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.3" { inherit profileName; }).out; garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out;
garage_table = (rustPackages."unknown".garage_table."0.9.3" { inherit profileName; }).out; garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out;
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out;
garage_web = (rustPackages."unknown".garage_web."0.9.3" { inherit profileName; }).out; garage_web = (rustPackages."unknown".garage_web."0.10.0" { inherit profileName; }).out;
git_version = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".git-version."0.3.9" { inherit profileName; }).out; git_version = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".git-version."0.3.9" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
sodiumoxide = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".kuska-sodiumoxide."0.2.5-0" { inherit profileName; }).out; sodiumoxide = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".kuska-sodiumoxide."0.2.5-0" { inherit profileName; }).out;
@ -1946,9 +1984,9 @@ in
}; };
}); });
"unknown".garage_api."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_api."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_api"; name = "garage_api";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/api"); src = fetchCrateLocal (workspaceSrc + "/src/api");
features = builtins.concatLists [ features = builtins.concatLists [
@ -1958,7 +1996,9 @@ in
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage_api/metrics" || rootFeatures' ? "garage_api/prometheus") "prometheus") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage_api/metrics" || rootFeatures' ? "garage_api/prometheus") "prometheus")
]; ];
dependencies = { dependencies = {
aes_gcm = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aes-gcm."0.10.3" { inherit profileName; }).out;
argon2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".argon2."0.5.3" { inherit profileName; }).out; argon2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".argon2."0.5.3" { inherit profileName; }).out;
async_compression = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".async-compression."0.4.6" { inherit profileName; }).out;
async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.77" { profileName = "__noProfile"; }).out; async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.77" { profileName = "__noProfile"; }).out;
base64 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".base64."0.21.7" { inherit profileName; }).out; base64 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".base64."0.21.7" { inherit profileName; }).out;
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out;
@ -1968,12 +2008,12 @@ in
form_urlencoded = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".form_urlencoded."1.2.1" { inherit profileName; }).out; form_urlencoded = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".form_urlencoded."1.2.1" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
garage_block = (rustPackages."unknown".garage_block."0.9.3" { inherit profileName; }).out; garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out;
garage_model = (rustPackages."unknown".garage_model."0.9.3" { inherit profileName; }).out; garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out;
garage_net = (rustPackages."unknown".garage_net."0.9.3" { inherit profileName; }).out; garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out;
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.3" { inherit profileName; }).out; garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out;
garage_table = (rustPackages."unknown".garage_table."0.9.3" { inherit profileName; }).out; garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out;
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
hmac = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hmac."0.12.1" { inherit profileName; }).out; hmac = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hmac."0.12.1" { inherit profileName; }).out;
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
@ -1999,14 +2039,15 @@ in
sha2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha2."0.10.8" { inherit profileName; }).out; sha2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha2."0.10.8" { inherit profileName; }).out;
tokio = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.36.0" { inherit profileName; }).out; tokio = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.36.0" { inherit profileName; }).out;
tokio_stream = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-stream."0.1.14" { inherit profileName; }).out; tokio_stream = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-stream."0.1.14" { inherit profileName; }).out;
tokio_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-util."0.7.10" { inherit profileName; }).out;
tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out; tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out;
url = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".url."2.5.0" { inherit profileName; }).out; url = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".url."2.5.0" { inherit profileName; }).out;
}; };
}); });
"unknown".garage_block."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_block."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_block"; name = "garage_block";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/block"); src = fetchCrateLocal (workspaceSrc + "/src/block");
features = builtins.concatLists [ features = builtins.concatLists [
@ -2020,11 +2061,11 @@ in
bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out; bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.9.3" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out;
garage_net = (rustPackages."unknown".garage_net."0.9.3" { inherit profileName; }).out; garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out;
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.3" { inherit profileName; }).out; garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out;
garage_table = (rustPackages."unknown".garage_table."0.9.3" { inherit profileName; }).out; garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out;
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out; rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
@ -2037,9 +2078,9 @@ in
}; };
}); });
"unknown".garage_db."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_db."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_db"; name = "garage_db";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/db"); src = fetchCrateLocal (workspaceSrc + "/src/db");
features = builtins.concatLists [ features = builtins.concatLists [
@ -2050,7 +2091,6 @@ in
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "r2d2") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "r2d2")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2_sqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "r2d2_sqlite") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2_sqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "r2d2_sqlite")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "rusqlite") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "rusqlite")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "sled")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite")
]; ];
dependencies = { dependencies = {
@ -2060,7 +2100,6 @@ in
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "r2d2" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".r2d2."0.8.10" { inherit profileName; }).out; ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "r2d2" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".r2d2."0.8.10" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2_sqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "r2d2_sqlite" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".r2d2_sqlite."0.24.0" { inherit profileName; }).out; ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/r2d2_sqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "r2d2_sqlite" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".r2d2_sqlite."0.24.0" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "rusqlite" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rusqlite."0.31.0" { inherit profileName; }).out; ${ if rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "rusqlite" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rusqlite."0.31.0" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "sled" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sled."0.34.7" { inherit profileName; }).out;
tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out; tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out;
}; };
devDependencies = { devDependencies = {
@ -2068,16 +2107,15 @@ in
}; };
}); });
"unknown".garage_model."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_model."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_model"; name = "garage_model";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/model"); src = fetchCrateLocal (workspaceSrc + "/src/model");
features = builtins.concatLists [ features = builtins.concatLists [
(lib.optional (rootFeatures' ? "garage_model/default") "default") (lib.optional (rootFeatures' ? "garage_model/default") "default")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/k2v" || rootFeatures' ? "garage_api/k2v" || rootFeatures' ? "garage_model/k2v") "k2v") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/k2v" || rootFeatures' ? "garage_api/k2v" || rootFeatures' ? "garage_model/k2v") "k2v")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/lmdb" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/lmdb") "lmdb") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/lmdb" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/lmdb") "lmdb")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "sled")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite")
]; ];
dependencies = { dependencies = {
@ -2089,13 +2127,14 @@ in
err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
garage_block = (rustPackages."unknown".garage_block."0.9.3" { inherit profileName; }).out; garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.9.3" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out;
garage_net = (rustPackages."unknown".garage_net."0.9.3" { inherit profileName; }).out; garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out;
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.3" { inherit profileName; }).out; garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out;
garage_table = (rustPackages."unknown".garage_table."0.9.3" { inherit profileName; }).out; garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out;
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
parse_duration = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parse_duration."2.1.1" { inherit profileName; }).out; parse_duration = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parse_duration."2.1.1" { inherit profileName; }).out;
rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out; rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
@ -2107,9 +2146,9 @@ in
}; };
}); });
"unknown".garage_net."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_net."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_net"; name = "garage_net";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/net"); src = fetchCrateLocal (workspaceSrc + "/src/net");
features = builtins.concatLists [ features = builtins.concatLists [
@ -2144,9 +2183,9 @@ in
}; };
}); });
"unknown".garage_rpc."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_rpc."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_rpc"; name = "garage_rpc";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/rpc"); src = fetchCrateLocal (workspaceSrc + "/src/rpc");
features = builtins.concatLists [ features = builtins.concatLists [
@ -2168,9 +2207,9 @@ in
format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out; format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.9.3" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out;
garage_net = (rustPackages."unknown".garage_net."0.9.3" { inherit profileName; }).out; garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out;
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out;
gethostname = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".gethostname."0.4.3" { inherit profileName; }).out; gethostname = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".gethostname."0.4.3" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
itertools = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".itertools."0.12.1" { inherit profileName; }).out; itertools = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".itertools."0.12.1" { inherit profileName; }).out;
@ -2192,9 +2231,9 @@ in
}; };
}); });
"unknown".garage_table."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_table."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_table"; name = "garage_table";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/table"); src = fetchCrateLocal (workspaceSrc + "/src/table");
dependencies = { dependencies = {
@ -2203,9 +2242,9 @@ in
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.9.3" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out;
garage_rpc = (rustPackages."unknown".garage_rpc."0.9.3" { inherit profileName; }).out; garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out;
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out; hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out;
opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
@ -2217,9 +2256,9 @@ in
}; };
}); });
"unknown".garage_util."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_util."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_util"; name = "garage_util";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/util"); src = fetchCrateLocal (workspaceSrc + "/src/util");
features = builtins.concatLists [ features = builtins.concatLists [
@ -2235,8 +2274,8 @@ in
digest = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".digest."0.10.7" { inherit profileName; }).out; digest = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".digest."0.10.7" { inherit profileName; }).out;
err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
garage_db = (rustPackages."unknown".garage_db."0.9.3" { inherit profileName; }).out; garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out;
garage_net = (rustPackages."unknown".garage_net."0.9.3" { inherit profileName; }).out; garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out; hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out;
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
@ -2261,18 +2300,18 @@ in
}; };
}); });
"unknown".garage_web."0.9.3" = overridableMkRustCrate (profileName: rec { "unknown".garage_web."0.10.0" = overridableMkRustCrate (profileName: rec {
name = "garage_web"; name = "garage_web";
version = "0.9.3"; version = "0.10.0";
registry = "unknown"; registry = "unknown";
src = fetchCrateLocal (workspaceSrc + "/src/web"); src = fetchCrateLocal (workspaceSrc + "/src/web");
dependencies = { dependencies = {
err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out;
futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out;
garage_api = (rustPackages."unknown".garage_api."0.9.3" { inherit profileName; }).out; garage_api = (rustPackages."unknown".garage_api."0.10.0" { inherit profileName; }).out;
garage_model = (rustPackages."unknown".garage_model."0.9.3" { inherit profileName; }).out; garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out;
garage_table = (rustPackages."unknown".garage_table."0.9.3" { inherit profileName; }).out; garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out;
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out;
http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out;
http_body_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http-body-util."0.1.0" { inherit profileName; }).out; http_body_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http-body-util."0.1.0" { inherit profileName; }).out;
hyper = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."1.1.0" { inherit profileName; }).out; hyper = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."1.1.0" { inherit profileName; }).out;
@ -2326,6 +2365,17 @@ in
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".ghash."0.5.1" = overridableMkRustCrate (profileName: rec {
name = "ghash";
version = "0.5.1";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"; };
dependencies = {
opaque_debug = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" { inherit profileName; }).out;
polyval = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".polyval."0.6.2" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".gimli."0.28.1" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".gimli."0.28.1" = overridableMkRustCrate (profileName: rec {
name = "gimli"; name = "gimli";
version = "0.28.1"; version = "0.28.1";
@ -2933,6 +2983,16 @@ in
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".inout."0.1.3" = overridableMkRustCrate (profileName: rec {
name = "inout";
version = "0.1.3";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"; };
dependencies = {
generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".instant."0.1.12" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".instant."0.1.12" = overridableMkRustCrate (profileName: rec {
name = "instant"; name = "instant";
version = "0.1.12"; version = "0.1.12";
@ -3782,6 +3842,13 @@ in
]; ];
}); });
"registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" = overridableMkRustCrate (profileName: rec {
name = "opaque-debug";
version = "0.3.1";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"; };
});
"registry+https://github.com/rust-lang/crates.io-index".openssl-probe."0.1.5" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".openssl-probe."0.1.5" = overridableMkRustCrate (profileName: rec {
name = "openssl-probe"; name = "openssl-probe";
version = "0.1.5"; version = "0.1.5";
@ -4241,6 +4308,19 @@ in
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".polyval."0.6.2" = overridableMkRustCrate (profileName: rec {
name = "polyval";
version = "0.6.2";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"; };
dependencies = {
cfg_if = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cfg-if."1.0.0" { inherit profileName; }).out;
${ if hostPlatform.parsed.cpu.name == "aarch64" || hostPlatform.parsed.cpu.name == "x86_64" || hostPlatform.parsed.cpu.name == "i686" then "cpufeatures" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cpufeatures."0.2.12" { inherit profileName; }).out;
opaque_debug = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" { inherit profileName; }).out;
universal_hash = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".universal-hash."0.5.1" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".powerfmt."0.2.0" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".powerfmt."0.2.0" = overridableMkRustCrate (profileName: rec {
name = "powerfmt"; name = "powerfmt";
version = "0.2.0"; version = "0.2.0";
@ -5378,27 +5458,6 @@ in
}; };
}); });
"registry+https://github.com/rust-lang/crates.io-index".sled."0.34.7" = overridableMkRustCrate (profileName: rec {
name = "sled";
version = "0.34.7";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935"; };
features = builtins.concatLists [
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default")
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "no_metrics")
];
dependencies = {
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crc32fast" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_epoch" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-epoch."0.9.18" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_utils" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-utils."0.8.19" { inherit profileName; }).out;
${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && (hostPlatform.parsed.kernel.name == "linux" || hostPlatform.parsed.kernel.name == "darwin" || hostPlatform.parsed.kernel.name == "windows") then "fs2" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".fs2."0.4.3" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "fxhash" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".fxhash."0.2.1" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "libc" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.153" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "log" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.20" { inherit profileName; }).out;
${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "parking_lot" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parking_lot."0.11.2" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".smallvec."1.13.1" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".smallvec."1.13.1" = overridableMkRustCrate (profileName: rec {
name = "smallvec"; name = "smallvec";
version = "1.13.1"; version = "1.13.1";
@ -6354,6 +6413,17 @@ in
]; ];
}); });
"registry+https://github.com/rust-lang/crates.io-index".universal-hash."0.5.1" = overridableMkRustCrate (profileName: rec {
name = "universal-hash";
version = "0.5.1";
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"; };
dependencies = {
crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out;
subtle = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".subtle."2.5.0" { inherit profileName; }).out;
};
});
"registry+https://github.com/rust-lang/crates.io-index".unsafe-libyaml."0.2.10" = overridableMkRustCrate (profileName: rec { "registry+https://github.com/rust-lang/crates.io-index".unsafe-libyaml."0.2.10" = overridableMkRustCrate (profileName: rec {
name = "unsafe-libyaml"; name = "unsafe-libyaml";
version = "0.2.10"; version = "0.2.10";
@ -6634,7 +6704,6 @@ in
[ "minwindef" ] [ "minwindef" ]
[ "ntstatus" ] [ "ntstatus" ]
[ "processenv" ] [ "processenv" ]
(lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "processthreadsapi")
[ "std" ] [ "std" ]
[ "synchapi" ] [ "synchapi" ]
[ "sysinfoapi" ] [ "sysinfoapi" ]

View file

@ -21,15 +21,15 @@ default-members = ["src/garage"]
# Internal Garage crates # Internal Garage crates
format_table = { version = "0.1.1", path = "src/format-table" } format_table = { version = "0.1.1", path = "src/format-table" }
garage_api = { version = "0.9.3", path = "src/api" } garage_api = { version = "0.10.0", path = "src/api" }
garage_block = { version = "0.9.3", path = "src/block" } garage_block = { version = "0.10.0", path = "src/block" }
garage_db = { version = "0.9.3", path = "src/db", default-features = false } garage_db = { version = "0.10.0", path = "src/db", default-features = false }
garage_model = { version = "0.9.3", path = "src/model", default-features = false } garage_model = { version = "0.10.0", path = "src/model", default-features = false }
garage_net = { version = "0.9.3", path = "src/net" } garage_net = { version = "0.10.0", path = "src/net" }
garage_rpc = { version = "0.9.3", path = "src/rpc" } garage_rpc = { version = "0.10.0", path = "src/rpc" }
garage_table = { version = "0.9.3", path = "src/table" } garage_table = { version = "0.10.0", path = "src/table" }
garage_util = { version = "0.9.3", path = "src/util" } garage_util = { version = "0.10.0", path = "src/util" }
garage_web = { version = "0.9.3", path = "src/web" } garage_web = { version = "0.10.0", path = "src/web" }
k2v-client = { version = "0.0.4", path = "src/k2v-client" } k2v-client = { version = "0.0.4", path = "src/k2v-client" }
# External crates from crates.io # External crates from crates.io
@ -66,6 +66,7 @@ sha2 = "0.10"
timeago = { version = "0.4", default-features = false } timeago = { version = "0.4", default-features = false }
xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] } xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] }
aes-gcm = { version = "0.10", features = ["aes", "stream"] }
sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" }
kuska-handshake = { version = "0.2.0", features = ["default", "async_std"] } kuska-handshake = { version = "0.2.0", features = ["default", "async_std"] }
@ -79,7 +80,6 @@ heed = { version = "0.11", default-features = false, features = ["lmdb"] }
rusqlite = "0.31.0" rusqlite = "0.31.0"
r2d2 = "0.8" r2d2 = "0.8"
r2d2_sqlite = "0.24" r2d2_sqlite = "0.24"
sled = "0.34"
async-compression = { version = "0.4", features = ["tokio", "zstd"] } async-compression = { version = "0.4", features = ["tokio", "zstd"] }
zstd = { version = "0.13", default-features = false } zstd = { version = "0.13", default-features = false }

View file

@ -40,7 +40,6 @@ in {
features = [ features = [
"garage/bundled-libs" "garage/bundled-libs"
"garage/k2v" "garage/k2v"
"garage/sled"
"garage/lmdb" "garage/lmdb"
"garage/sqlite" "garage/sqlite"
]; ];

View file

@ -98,7 +98,6 @@ paths:
type: string type: string
example: example:
- "k2v" - "k2v"
- "sled"
- "lmdb" - "lmdb"
- "sqlite" - "sqlite"
- "consul-discovery" - "consul-discovery"

View file

@ -80,6 +80,53 @@ To test your new configuration, just reload your Nextcloud webpage and start sen
*External link:* [Nextcloud Documentation > Primary Storage](https://docs.nextcloud.com/server/latest/admin_manual/configuration_files/primary_storage.html) *External link:* [Nextcloud Documentation > Primary Storage](https://docs.nextcloud.com/server/latest/admin_manual/configuration_files/primary_storage.html)
#### SSE-C encryption (since Garage v1.0)
Since version 1.0, Garage supports server-side encryption with customer keys
(SSE-C). In this mode, Garage is responsible for encrypting and decrypting
objects, but it does not store the encryption key itself. The encryption key
should be provided by Nextcloud upon each request. This mode of operation is
supported by Nextcloud and it has successfully been tested together with
Garage.
To enable SSE-C encryption:
1. Make sure your Garage server is accessible via SSL through a reverse proxy
such as Nginx, and that it is using a valid public certificate (Nextcloud
might be able to connect to an S3 server that is using a self-signed
certificate, but you will lose many hours while trying, so don't).
Configure values for `use_ssl` and `port` accordingly in your `config.php`
file.
2. Generate an encryption key using the following command:
```
openssl rand -base64 32
```
Make sure to keep this key **secret**!
3. Add the encryption key in your `config.php` file as follows:
```php
<?php
$CONFIG = array(
'objectstore' => [
'class' => '\\OC\\Files\\ObjectStore\\S3',
'arguments' => [
...
'sse_c_key' => 'exampleencryptionkeyLbU+5fKYQcVoqnn+RaIOXgo=',
...
],
],
```
Nextcloud will now make Garage encrypt files at rest in the storage bucket.
These files will not be readable by an S3 client that has credentials to the
bucket but doesn't also know the secret encryption key.
### External Storage ### External Storage
**From the GUI.** Activate the "External storage support" app from the "Applications" page (click on your account icon on the top right corner of your screen to display the menu). Go to your parameters page (also located below your account icon). Click on external storage (or the corresponding translation in your language). **From the GUI.** Activate the "External storage support" app from the "Applications" page (click on your account icon on the top right corner of your screen to display the menu). Go to your parameters page (also located below your account icon). Click on external storage (or the corresponding translation in your language).
@ -245,7 +292,7 @@ with average object size ranging from 50 KB to 150 KB.
As such, your Garage cluster should be configured appropriately for good performance: As such, your Garage cluster should be configured appropriately for good performance:
- use Garage v0.8.0 or higher with the [LMDB database engine](@documentation/reference-manual/configuration.md#db-engine-since-v0-8-0). - use Garage v0.8.0 or higher with the [LMDB database engine](@documentation/reference-manual/configuration.md#db-engine-since-v0-8-0).
With the default Sled database engine, your database could quickly end up taking tens of GB of disk space. Older versions of Garage used the Sled database engine which had issues, such as databases quickly ending up taking tens of GB of disk space.
- the Garage database should be stored on a SSD - the Garage database should be stored on a SSD
### Creating your bucket ### Creating your bucket

View file

@ -90,6 +90,5 @@ The following feature flags are available in v0.8.0:
| `kubernetes-discovery` | optional | Enable automatic registration and discovery<br>of cluster nodes through the Kubernetes API | | `kubernetes-discovery` | optional | Enable automatic registration and discovery<br>of cluster nodes through the Kubernetes API |
| `metrics` | *by default* | Enable collection of metrics in Prometheus format on the admin API | | `metrics` | *by default* | Enable collection of metrics in Prometheus format on the admin API |
| `telemetry-otlp` | optional | Enable collection of execution traces using OpenTelemetry | | `telemetry-otlp` | optional | Enable collection of execution traces using OpenTelemetry |
| `sled` | *by default* | Enable using Sled to store Garage's metadata |
| `lmdb` | *by default* | Enable using LMDB to store Garage's metadata | | `lmdb` | *by default* | Enable using LMDB to store Garage's metadata |
| `sqlite` | *by default* | Enable using Sqlite3 to store Garage's metadata | | `sqlite` | *by default* | Enable using Sqlite3 to store Garage's metadata |

View file

@ -68,6 +68,11 @@ to store 2 TB of data in total.
EXT4 is not recommended as it has more strict limitations on the number of inodes, EXT4 is not recommended as it has more strict limitations on the number of inodes,
which might cause issues with Garage when large numbers of objects are stored. which might cause issues with Garage when large numbers of objects are stored.
- If you only have an HDD and no SSD, it's fine to put your metadata alongside the data
on the same drive. Having lots of RAM for your kernel to cache the metadata will
help a lot with performance. The default LMDB database engine is the most tested
and has good performance.
- Servers with multiple HDDs are supported natively by Garage without resorting - Servers with multiple HDDs are supported natively by Garage without resorting
to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md). to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md).
@ -87,11 +92,6 @@ to store 2 TB of data in total.
and 2/ LMDB is not suited for 32-bit platforms. Sqlite is a viable alternative and 2/ LMDB is not suited for 32-bit platforms. Sqlite is a viable alternative
if any of these are of concern. if any of these are of concern.
- If you only have an HDD and no SSD, it's fine to put your metadata alongside
the data on the same drive, but then consider your filesystem choice wisely
(see above). Having lots of RAM for your kernel to cache the metadata will
help a lot with performance.
## Get a Docker image ## Get a Docker image
Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated).
@ -127,7 +127,7 @@ data_dir = "/var/lib/garage/data"
db_engine = "lmdb" db_engine = "lmdb"
metadata_auto_snapshot_interval = "6h" metadata_auto_snapshot_interval = "6h"
replication_mode = "3" replication_factor = 3
compression_level = 2 compression_level = 2

View file

@ -97,7 +97,7 @@ delete a tombstone, the following condition has to be met:
superseded by the tombstone. This ensures that deleting the tombstone is superseded by the tombstone. This ensures that deleting the tombstone is
safe and that no deleted value will come back in the system. safe and that no deleted value will come back in the system.
Garage makes use of Sled's atomic operations (such as compare-and-swap and Garage uses atomic database operations (such as compare-and-swap and
transactions) to ensure that only tombstones that have been correctly transactions) to ensure that only tombstones that have been correctly
propagated to other nodes are ever deleted from the local entry tree. propagated to other nodes are ever deleted from the local entry tree.

View file

@ -12,7 +12,7 @@ An introduction to building cluster layouts can be found in the [production depl
In Garage, all of the data that can be stored in a given cluster is divided In Garage, all of the data that can be stored in a given cluster is divided
into slices which we call *partitions*. Each partition is stored by into slices which we call *partitions*. Each partition is stored by
one or several nodes in the cluster one or several nodes in the cluster
(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication_mode)). (see [`replication_factor`](@/documentation/reference-manual/configuration.md#replication_factor)).
The layout determines the correspondence between these partitions, The layout determines the correspondence between these partitions,
which exist on a logical level, and actual storage nodes. which exist on a logical level, and actual storage nodes.

View file

@ -59,7 +59,7 @@ metadata_dir = "/tmp/meta"
data_dir = "/tmp/data" data_dir = "/tmp/data"
db_engine = "sqlite" db_engine = "sqlite"
replication_mode = "none" replication_factor = 1
rpc_bind_addr = "[::]:3901" rpc_bind_addr = "[::]:3901"
rpc_public_addr = "127.0.0.1:3901" rpc_public_addr = "127.0.0.1:3901"

View file

@ -8,7 +8,8 @@ weight = 20
Here is an example `garage.toml` configuration file that illustrates all of the possible options: Here is an example `garage.toml` configuration file that illustrates all of the possible options:
```toml ```toml
replication_mode = "3" replication_factor = 3
consistency_mode = "consistent"
metadata_dir = "/var/lib/garage/meta" metadata_dir = "/var/lib/garage/meta"
data_dir = "/var/lib/garage/data" data_dir = "/var/lib/garage/data"
@ -21,8 +22,6 @@ db_engine = "lmdb"
block_size = "1M" block_size = "1M"
sled_cache_capacity = "128MiB"
sled_flush_every_ms = 2000
lmdb_map_size = "1T" lmdb_map_size = "1T"
compression_level = 1 compression_level = 1
@ -94,13 +93,12 @@ Top-level configuration options:
[`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval), [`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval),
[`metadata_dir`](#metadata_dir), [`metadata_dir`](#metadata_dir),
[`metadata_fsync`](#metadata_fsync), [`metadata_fsync`](#metadata_fsync),
[`replication_mode`](#replication_mode), [`replication_factor`](#replication_factor),
[`consistency_mode`](#consistency_mode),
[`rpc_bind_addr`](#rpc_bind_addr), [`rpc_bind_addr`](#rpc_bind_addr),
[`rpc_bind_outgoing`](#rpc_bind_outgoing), [`rpc_bind_outgoing`](#rpc_bind_outgoing),
[`rpc_public_addr`](#rpc_public_addr), [`rpc_public_addr`](#rpc_public_addr),
[`rpc_secret`/`rpc_secret_file`](#rpc_secret), [`rpc_secret`/`rpc_secret_file`](#rpc_secret).
[`sled_cache_capacity`](#sled_cache_capacity),
[`sled_flush_every_ms`](#sled_flush_every_ms).
The `[consul_discovery]` section: The `[consul_discovery]` section:
[`api`](#consul_api), [`api`](#consul_api),
@ -137,11 +135,12 @@ The `[admin]` section:
### Top-level configuration options ### Top-level configuration options
#### `replication_mode` {#replication_mode} #### `replication_factor` {#replication_factor}
Garage supports the following replication modes: The replication factor can be any positive integer smaller than or equal to the node count in your cluster.
The chosen replication factor has a big impact on the cluster's failure tolerance and performance characteristics.
- `none` or `1`: data stored on Garage is stored on a single node. There is no - `1`: data stored on Garage is stored on a single node. There is no
redundancy, and data will be unavailable as soon as one node fails or its redundancy, and data will be unavailable as soon as one node fails or its
network is disconnected. Do not use this for anything else than test network is disconnected. Do not use this for anything else than test
deployments. deployments.
@ -152,17 +151,6 @@ Garage supports the following replication modes:
before losing data. Data remains available in read-only mode when one node is before losing data. Data remains available in read-only mode when one node is
down, but write operations will fail. down, but write operations will fail.
- `2-dangerous`: a variant of mode `2`, where written objects are written to
the second replica asynchronously. This means that Garage will return `200
OK` to a PutObject request before the second copy is fully written (or even
before it even starts being written). This means that data can more easily
be lost if the node crashes before a second copy can be completed. This
also means that written objects might not be visible immediately in read
operations. In other words, this mode severely breaks the consistency and
durability guarantees of standard Garage cluster operation. Benefits of
this mode: you can still write to your cluster when one node is
unavailable.
- `3`: data stored on Garage will be stored on three different nodes, if - `3`: data stored on Garage will be stored on three different nodes, if
possible each in a different zone. Garage tolerates two node failures, or possible each in a different zone. Garage tolerates two node failures, or
several node failures but in no more than two zones (in a deployment with at several node failures but in no more than two zones (in a deployment with at
@ -170,55 +158,84 @@ Garage supports the following replication modes:
or node failures are only in a single zone, reading and writing data to or node failures are only in a single zone, reading and writing data to
Garage can continue normally. Garage can continue normally.
- `3-degraded`: a variant of replication mode `3`, that lowers the read - `5`, `7`, ...: When setting the replication factor above 3, it is most useful to
quorum to `1`, to allow you to read data from your cluster when several choose an odd value, since for every two copies added, one more node can fail
nodes (or nodes in several zones) are unavailable. In this mode, Garage before losing the ability to write and read to the cluster.
does not provide read-after-write consistency anymore. The write quorum is
still 2, ensuring that data successfully written to Garage is stored on at
least two nodes.
- `3-dangerous`: a variant of replication mode `3` that lowers both the read
and write quorums to `1`, to allow you to both read and write to your
cluster when several nodes (or nodes in several zones) are unavailable. It
is the least consistent mode of operation proposed by Garage, and also one
that should probably never be used.
Note that in modes `2` and `3`, Note that in modes `2` and `3`,
if at least the same number of zones are available, an arbitrary number of failures in if at least the same number of zones are available, an arbitrary number of failures in
any given zone is tolerated as copies of data will be spread over several zones. any given zone is tolerated as copies of data will be spread over several zones.
**Make sure `replication_mode` is the same in the configuration files of all nodes. **Make sure `replication_factor` is the same in the configuration files of all nodes.
Never run a Garage cluster where that is not the case.** Never run a Garage cluster where that is not the case.**
It is technically possible to change the replication factor although it's a
dangerous operation that is not officially supported. This requires you to
delete the existing cluster layout and create a new layout from scratch,
meaning that a full rebalancing of your cluster's data will be needed. To do
it, shut down your cluster entirely, delete the `cluster_layout` files in the
meta directories of all your nodes, update all your configuration files with
the new `replication_factor` parameter, restart your cluster, and then create a
new layout with all the nodes you want to keep. Rebalancing data will take
some time, and data might temporarily appear unavailable to your users.
It is recommended to shut down public access to the cluster while rebalancing
is in progress. In theory, no data should be lost as rebalancing is a
routine operation for Garage, although we cannot guarantee you that everything
will go right in such an extreme scenario.
#### `consistency_mode` {#consistency_mode}
The consistency mode setting determines the read and write behaviour of your cluster.
- `consistent`: The default setting. This is what the paragraph above describes.
The read and write quorum will be determined so that read-after-write consistency
is guaranteed.
- `degraded`: Lowers the read
quorum to `1`, to allow you to read data from your cluster when several
nodes (or nodes in several zones) are unavailable. In this mode, Garage
does not provide read-after-write consistency anymore.
The write quorum stays the same as in the `consistent` mode, ensuring that
data successfully written to Garage is stored on multiple nodes (depending
on the replication factor).
- `dangerous`: This mode lowers both the read
and write quorums to `1`, to allow you to both read and write to your
cluster when several nodes (or nodes in several zones) are unavailable. It
is the least consistent mode of operation proposed by Garage, and also one
that should probably never be used.
Changing the `consistency_mode` between modes while leaving the `replication_factor` untouched
(e.g. setting your node's `consistency_mode` to `degraded` when it was previously unset, or from
`dangerous` to `consistent`), can be done easily by just changing the `consistency_mode`
parameter in your config files and restarting all your Garage nodes.
The consistency mode can be used together with various replication factors, to achieve
a wide range of read and write characteristics. Some examples:
- Replication factor `2`, consistency mode `degraded`: While this mode
technically exists, its properties are the same as with consistency mode `consistent`,
since the read quorum with replication factor `2`, consistency mode `consistent` is already 1.
- Replication factor `2`, consistency mode `dangerous`: written objects are written to
the second replica asynchronously. This means that Garage will return `200
OK` to a PutObject request before the second copy is fully written (or even
before it even starts being written). This means that data can more easily
be lost if the node crashes before a second copy can be completed. This
also means that written objects might not be visible immediately in read
operations. In other words, this configuration severely breaks the consistency and
durability guarantees of standard Garage cluster operation. Benefits of
this configuration: you can still write to your cluster when one node is
unavailable.
The quorums associated with each replication mode are described below: The quorums associated with each replication mode are described below:
| `replication_mode` | Number of replicas | Write quorum | Read quorum | Read-after-write consistency? | | `consistency_mode` | `replication_factor` | Write quorum | Read quorum | Read-after-write consistency? |
| ------------------ | ------------------ | ------------ | ----------- | ----------------------------- | | ------------------ | -------------------- | ------------ | ----------- | ----------------------------- |
| `none` or `1` | 1 | 1 | 1 | yes | | `consistent` | 1 | 1 | 1 | yes |
| `2` | 2 | 2 | 1 | yes | | `consistent` | 2 | 2 | 1 | yes |
| `2-dangerous` | 2 | 1 | 1 | NO | | `dangerous` | 2 | 1 | 1 | NO |
| `3` | 3 | 2 | 2 | yes | | `consistent` | 3 | 2 | 2 | yes |
| `3-degraded` | 3 | 2 | 1 | NO | | `degraded` | 3 | 2 | 1 | NO |
| `3-dangerous` | 3 | 1 | 1 | NO | | `dangerous` | 3 | 1 | 1 | NO |
Changing the `replication_mode` between modes with the same number of replicas
(e.g. from `3` to `3-degraded`, or from `2-dangerous` to `2`), can be done easily by
just changing the `replication_mode` parameter in your config files and restarting all your
Garage nodes.
It is also technically possible to change the replication mode to a mode with a
different numbers of replicas, although it's a dangerous operation that is not
officially supported. This requires you to delete the existing cluster layout
and create a new layout from scratch, meaning that a full rebalancing of your
cluster's data will be needed. To do it, shut down your cluster entirely,
delete the `custer_layout` files in the meta directories of all your nodes,
update all your configuration files with the new `replication_mode` parameter,
restart your cluster, and then create a new layout with all the nodes you want
to keep. Rebalancing data will take some time, and data might temporarily
appear unavailable to your users. It is recommended to shut down public access
to the cluster while rebalancing is in progress. In theory, no data should be
lost as rebalancing is a routine operation for Garage, although we cannot
guarantee you that everything will go right in such an extreme scenario.
#### `metadata_dir` {#metadata_dir} #### `metadata_dir` {#metadata_dir}
@ -254,23 +271,18 @@ Since `v0.8.0`, Garage can use alternative storage backends as follows:
| DB engine | `db_engine` value | Database path | | DB engine | `db_engine` value | Database path |
| --------- | ----------------- | ------------- | | --------- | ----------------- | ------------- |
| [LMDB](https://www.lmdb.tech) (default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` | | [LMDB](https://www.lmdb.tech) (since `v0.8.0`, default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` |
| [Sled](https://sled.rs) (default up to `v0.8.0`) | `"sled"` | `<metadata_dir>/db/` | | [Sqlite](https://sqlite.org) (since `v0.8.0`) | `"sqlite"` | `<metadata_dir>/db.sqlite` |
| [Sqlite](https://sqlite.org) | `"sqlite"` | `<metadata_dir>/db.sqlite` | | [Sled](https://sled.rs) (old default, removed since `v1.0`) | `"sled"` | `<metadata_dir>/db/` |
Sled was the only database engine up to Garage v0.7.0. Performance issues and Sled was supported until Garage v0.9.x, and was removed in Garage v1.0.
API limitations of Sled prompted the addition of alternative engines in v0.8.0. You can still use an older binary of Garage (e.g. v0.9.3) to migrate
Since v0.9.0, LMDB is the default engine instead of Sled, and Sled is old Sled metadata databases to another engine.
deprecated. We plan to remove Sled in Garage v1.0.
Performance characteristics of the different DB engines are as follows: Performance characteristics of the different DB engines are as follows:
- Sled: tends to produce large data files and also has performance issues, - LMDB: the recommended database engine for high-performance distributed clusters.
especially when the metadata folder is on a traditional HDD and not on SSD. LMDB works very well, but is known to have the following limitations:
- LMDB: the recommended database engine for high-performance distributed
clusters, much more space-efficient and significantly faster. LMDB works very
well, but is known to have the following limitations:
- The data format of LMDB is not portable between architectures, so for - The data format of LMDB is not portable between architectures, so for
instance the Garage database of an x86-64 node cannot be moved to an ARM64 instance the Garage database of an x86-64 node cannot be moved to an ARM64
@ -286,6 +298,9 @@ Performance characteristics of the different DB engines are as follows:
other nodes), or if you have saved regular snapshots at the filesystem other nodes), or if you have saved regular snapshots at the filesystem
level. level.
- Keys in LMDB are limited to 511 bytes. This limit translates to limits on
object keys in S3 and sort keys in K2V that are limited to 479 bytes.
- Sqlite: Garage supports Sqlite as an alternative storage backend for - Sqlite: Garage supports Sqlite as an alternative storage backend for
metadata, which does not have the issues listed above for LMDB. metadata, which does not have the issues listed above for LMDB.
On versions 0.8.x and earlier, Sqlite should be avoided due to abysmal On versions 0.8.x and earlier, Sqlite should be avoided due to abysmal
@ -329,7 +344,6 @@ Here is how this option impacts the different database engines:
| Database | `metadata_fsync = false` (default) | `metadata_fsync = true` | | Database | `metadata_fsync = false` (default) | `metadata_fsync = true` |
|----------|------------------------------------|-------------------------------| |----------|------------------------------------|-------------------------------|
| Sled | default options | *unsupported* |
| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` | | Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` |
| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` | | LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` |
@ -398,21 +412,6 @@ files will remain available. This however means that chunks from existing files
will not be deduplicated with chunks from newly uploaded files, meaning you will not be deduplicated with chunks from newly uploaded files, meaning you
might use more storage space than is optimally possible. might use more storage space than is optimally possible.
#### `sled_cache_capacity` {#sled_cache_capacity}
This parameter can be used to tune the capacity of the cache used by
[sled](https://sled.rs), the database Garage uses internally to store metadata.
Tune this to fit the RAM you wish to make available to your Garage instance.
This value has a conservative default (128MB) so that Garage doesn't use too much
RAM by default, but feel free to increase this for higher performance.
#### `sled_flush_every_ms` {#sled_flush_every_ms}
This parameters can be used to tune the flushing interval of sled.
Increase this if sled is thrashing your SSD, at the risk of losing more data in case
of a power outage (though this should not matter much as data is replicated on other
nodes). The default value, 2000ms, should be appropriate for most use cases.
#### `lmdb_map_size` {#lmdb_map_size} #### `lmdb_map_size` {#lmdb_map_size}
This parameter can be used to set the map size used by LMDB, This parameter can be used to set the map size used by LMDB,

View file

@ -39,10 +39,10 @@ Read about cluster layout management [here](@/documentation/operations/layout.md
### Several replication modes ### Several replication modes
Garage supports a variety of replication modes, with 1 copy, 2 copies or 3 copies of your data, Garage supports a variety of replication modes, with configurable replica count,
and with various levels of consistency, in order to adapt to a variety of usage scenarios. and with various levels of consistency, in order to adapt to a variety of usage scenarios.
Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_mode) Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_factor)
to select the replication mode best suited to your use case (hint: in most cases, `replication_mode = "3"` is what you want). to select the replication mode best suited to your use case (hint: in most cases, `replication_factor = 3` is what you want).
### Compression and deduplication ### Compression and deduplication

View file

@ -33,6 +33,7 @@ Feel free to open a PR to suggest fixes this table. Minio is missing because the
| [URL path-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#path-style-access) (eg. `host.tld/bucket/key`) | ✅ Implemented | ✅ | ✅ | ❓| ✅ | | [URL path-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#path-style-access) (eg. `host.tld/bucket/key`) | ✅ Implemented | ✅ | ✅ | ❓| ✅ |
| [URL vhost-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access) URL (eg. `bucket.host.tld/key`) | ✅ Implemented | ❌| ✅| ✅ | ✅ | | [URL vhost-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access) URL (eg. `bucket.host.tld/key`) | ✅ Implemented | ❌| ✅| ✅ | ✅ |
| [Presigned URLs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html) | ✅ Implemented | ❌| ✅ | ✅ | ✅(❓) | | [Presigned URLs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html) | ✅ Implemented | ❌| ✅ | ✅ | ✅(❓) |
| [SSE-C encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html) | ✅ Implemented | ❓ | ✅ | ❌ | ✅ |
*Note:* OpenIO does not say whether it supports presigned URLs. Because it is part *Note:* OpenIO does not say whether it supports presigned URLs. Because it is part
of signature v4 and they claim they support it without additional precisions, of signature v4 and they claim they support it without additional precisions,

View file

@ -69,11 +69,10 @@ Example response body:
```json ```json
{ {
"node": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", "node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"garageVersion": "git:v0.9.0-dev", "garageVersion": "v0.10.0",
"garageFeatures": [ "garageFeatures": [
"k2v", "k2v",
"sled",
"lmdb", "lmdb",
"sqlite", "sqlite",
"metrics", "metrics",
@ -81,83 +80,92 @@ Example response body:
], ],
"rustVersion": "1.68.0", "rustVersion": "1.68.0",
"dbEngine": "LMDB (using Heed crate)", "dbEngine": "LMDB (using Heed crate)",
"knownNodes": [ "layoutVersion": 5,
"nodes": [
{ {
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
"addr": "10.0.0.11:3901", "role": {
"id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
"zone": "dc1",
"capacity": 100000000000,
"tags": []
},
"addr": "10.0.0.3:3901",
"hostname": "node3",
"isUp": true, "isUp": true,
"lastSeenSecsAgo": 9, "lastSeenSecsAgo": 12,
"hostname": "node1" "draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
}, },
{ {
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff", "id": "a11c7cf18af297379eff8688360155fe68d9061654449ba0ce239252f5a7487f",
"addr": "10.0.0.12:3901", "role": null,
"addr": "10.0.0.2:3901",
"hostname": "node2",
"isUp": true, "isUp": true,
"lastSeenSecsAgo": 1, "lastSeenSecsAgo": 11,
"hostname": "node2" "draining": true,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
}, },
{ {
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", "id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
"addr": "10.0.0.21:3901", "role": {
"id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
"zone": "dc1",
"capacity": 100000000000,
"tags": []
},
"addr": "127.0.0.1:3904",
"hostname": "lindy",
"isUp": true, "isUp": true,
"lastSeenSecsAgo": 7, "lastSeenSecsAgo": 2,
"hostname": "node3" "draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
}, },
{ {
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b", "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"addr": "10.0.0.22:3901", "role": {
"id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"zone": "dc1",
"capacity": 100000000000,
"tags": []
},
"addr": "10.0.0.1:3901",
"hostname": "node1",
"isUp": true, "isUp": true,
"lastSeenSecsAgo": 1, "lastSeenSecsAgo": 3,
"hostname": "node4" "draining": false,
"dataPartition": {
"available": 660270088192,
"total": 873862266880
},
"metadataPartition": {
"available": 660270088192,
"total": 873862266880
}
} }
], ]
"layout": {
"version": 12,
"roles": [
{
"id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
"zone": "dc1",
"capacity": 10737418240,
"tags": [
"node1"
]
},
{
"id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
"zone": "dc1",
"capacity": 10737418240,
"tags": [
"node2"
]
},
{
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
"zone": "dc2",
"capacity": 10737418240,
"tags": [
"node3"
]
}
],
"stagedRoleChanges": [
{
"id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
"remove": false,
"zone": "dc2",
"capacity": 10737418240,
"tags": [
"node4"
]
}
{
"id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
"remove": true,
"zone": null,
"capacity": null,
"tags": null,
}
]
}
} }
``` ```

View file

@ -146,7 +146,7 @@ in a bucket, as the partition key becomes the sort key in the index.
How indexing works: How indexing works:
- Each node keeps a local count of how many items it stores for each partition, - Each node keeps a local count of how many items it stores for each partition,
in a local Sled tree that is updated atomically when an item is modified. in a local database tree that is updated atomically when an item is modified.
- These local counters are asynchronously stored in the index table which is - These local counters are asynchronously stored in the index table which is
a regular Garage table spread in the network. Counters are stored as LWW values, a regular Garage table spread in the network. Counters are stored as LWW values,
so basically the final table will have the following structure: so basically the final table will have the following structure:

View file

@ -168,7 +168,7 @@ let
rootFeatures = if features != null then rootFeatures = if features != null then
features features
else else
([ "garage/bundled-libs" "garage/sled" "garage/lmdb" "garage/sqlite" "garage/k2v" ] ++ (if release then [ ([ "garage/bundled-libs" "garage/lmdb" "garage/sqlite" "garage/k2v" ] ++ (if release then [
"garage/consul-discovery" "garage/consul-discovery"
"garage/kubernetes-discovery" "garage/kubernetes-discovery"
"garage/metrics" "garage/metrics"

View file

@ -6,18 +6,13 @@
garage: garage:
# Can be changed for better performance on certain systems # Can be changed for better performance on certain systems
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0 # https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0
dbEngine: "sled" dbEngine: "lmdb"
# Default is 1MB # Default is 1MB
# An increase can result in better performance in certain scenarios # An increase can result in better performance in certain scenarios
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#block-size # https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#block-size
blockSize: "1048576" blockSize: "1048576"
# Tuning parameters for the sled DB engine
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#sled-cache-capacity
sledCacheCapacity: "134217728"
sledFlushEveryMs: "2000"
# Default to 3 replicas, see the replication_mode section at # Default to 3 replicas, see the replication_mode section at
# https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#replication-mode # https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#replication-mode
replicationMode: "3" replicationMode: "3"
@ -50,11 +45,6 @@ garage:
block_size = {{ .Values.garage.blockSize }} block_size = {{ .Values.garage.blockSize }}
{{- if eq .Values.garage.dbEngine "sled"}}
sled_cache_capacity = {{ .Values.garage.sledCacheCapacity }}
sled_flush_every_ms = {{ .Values.garage.sledFlushEveryMs }}
{{- end }}
replication_mode = "{{ .Values.garage.replicationMode }}" replication_mode = "{{ .Values.garage.replicationMode }}"
compression_level = {{ .Values.garage.compressionLevel }} compression_level = {{ .Values.garage.compressionLevel }}

View file

@ -82,6 +82,19 @@ if [ -z "$SKIP_AWS" ]; then
exit 1 exit 1
fi fi
aws s3api delete-object --bucket eprouvette --key upload aws s3api delete-object --bucket eprouvette --key upload
echo "🛠️ Test SSE-C with awscli (aws s3)"
SSEC_KEY="u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y="
SSEC_KEY_MD5="jMGbs3GyZkYjJUP6q5jA7g=="
echo "$SSEC_KEY" | base64 -d > /tmp/garage.ssec-key
for idx in {1,2}.rnd; do
aws s3 cp --sse-c AES256 --sse-c-key fileb:///tmp/garage.ssec-key \
"/tmp/garage.$idx" "s3://eprouvette/garage.$idx.aws.sse-c"
aws s3 cp --sse-c AES256 --sse-c-key fileb:///tmp/garage.ssec-key \
"s3://eprouvette/garage.$idx.aws.sse-c" "/tmp/garage.$idx.dl.sse-c"
diff "/tmp/garage.$idx" "/tmp/garage.$idx.dl.sse-c"
aws s3api delete-object --bucket eprouvette --key "garage.$idx.aws.sse-c"
done
fi fi
# S3CMD # S3CMD

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_api" name = "garage_api"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -21,7 +21,9 @@ garage_net.workspace = true
garage_util.workspace = true garage_util.workspace = true
garage_rpc.workspace = true garage_rpc.workspace = true
aes-gcm.workspace = true
argon2.workspace = true argon2.workspace = true
async-compression.workspace = true
async-trait.workspace = true async-trait.workspace = true
base64.workspace = true base64.workspace = true
bytes.workspace = true bytes.workspace = true
@ -41,6 +43,7 @@ futures.workspace = true
futures-util.workspace = true futures-util.workspace = true
tokio.workspace = true tokio.workspace = true
tokio-stream.workspace = true tokio-stream.workspace = true
tokio-util.workspace = true
form_urlencoded.workspace = true form_urlencoded.workspace = true
http.workspace = true http.workspace = true

View file

@ -276,7 +276,7 @@ impl ApiHandler for AdminApiServer {
Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await, Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await,
Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await, Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await,
Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await, Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await,
Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await, Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage).await,
// Keys // Keys
Endpoint::ListKeys => handle_list_keys(&self.garage).await, Endpoint::ListKeys => handle_list_keys(&self.garage).await,
Endpoint::GetKeyInfo { Endpoint::GetKeyInfo {

View file

@ -123,7 +123,7 @@ async fn bucket_info_results(
.table .table
.get(&bucket_id, &EmptyKey) .get(&bucket_id, &EmptyKey)
.await? .await?
.map(|x| x.filtered_values(&garage.system.ring.borrow())) .map(|x| x.filtered_values(&garage.system.cluster_layout()))
.unwrap_or_default(); .unwrap_or_default();
let mpu_counters = garage let mpu_counters = garage
@ -131,7 +131,7 @@ async fn bucket_info_results(
.table .table
.get(&bucket_id, &EmptyKey) .get(&bucket_id, &EmptyKey)
.await? .await?
.map(|x| x.filtered_values(&garage.system.ring.borrow())) .map(|x| x.filtered_values(&garage.system.cluster_layout()))
.unwrap_or_default(); .unwrap_or_default();
let mut relevant_keys = HashMap::new(); let mut relevant_keys = HashMap::new();

View file

@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::sync::Arc; use std::sync::Arc;
@ -16,25 +17,95 @@ use crate::admin::error::*;
use crate::helpers::{json_ok_response, parse_json_body}; use crate::helpers::{json_ok_response, parse_json_body};
pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> { pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> {
let layout = garage.system.cluster_layout();
let mut nodes = garage
.system
.get_known_nodes()
.into_iter()
.map(|i| {
(
i.id,
NodeResp {
id: hex::encode(i.id),
addr: Some(i.addr),
hostname: i.status.hostname,
is_up: i.is_up,
last_seen_secs_ago: i.last_seen_secs_ago,
data_partition: i
.status
.data_disk_avail
.map(|(avail, total)| FreeSpaceResp {
available: avail,
total,
}),
metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| {
FreeSpaceResp {
available: avail,
total,
}
}),
..Default::default()
},
)
})
.collect::<HashMap<_, _>>();
for (id, _, role) in layout.current().roles.items().iter() {
if let layout::NodeRoleV(Some(r)) = role {
let role = NodeRoleResp {
id: hex::encode(id),
zone: r.zone.to_string(),
capacity: r.capacity,
tags: r.tags.clone(),
};
match nodes.get_mut(id) {
None => {
nodes.insert(
*id,
NodeResp {
id: hex::encode(id),
role: Some(role),
..Default::default()
},
);
}
Some(n) => {
if n.role.is_none() {
n.role = Some(role);
}
}
}
}
}
for ver in layout.versions.iter().rev().skip(1) {
for (id, _, role) in ver.roles.items().iter() {
if let layout::NodeRoleV(Some(r)) = role {
if !nodes.contains_key(id) && r.capacity.is_some() {
nodes.insert(
*id,
NodeResp {
id: hex::encode(id),
draining: true,
..Default::default()
},
);
}
}
}
}
let mut nodes = nodes.into_values().collect::<Vec<_>>();
nodes.sort_by(|x, y| x.id.cmp(&y.id));
let res = GetClusterStatusResponse { let res = GetClusterStatusResponse {
node: hex::encode(garage.system.id), node: hex::encode(garage.system.id),
garage_version: garage_util::version::garage_version(), garage_version: garage_util::version::garage_version(),
garage_features: garage_util::version::garage_features(), garage_features: garage_util::version::garage_features(),
rust_version: garage_util::version::rust_version(), rust_version: garage_util::version::rust_version(),
db_engine: garage.db.engine(), db_engine: garage.db.engine(),
known_nodes: garage layout_version: layout.current().version,
.system nodes,
.get_known_nodes()
.into_iter()
.map(|i| KnownNodeResp {
id: hex::encode(i.id),
addr: i.addr,
is_up: i.is_up,
last_seen_secs_ago: i.last_seen_secs_ago,
hostname: i.status.hostname,
})
.collect(),
layout: format_cluster_layout(&garage.system.get_cluster_layout()),
}; };
Ok(json_ok_response(&res)?) Ok(json_ok_response(&res)?)
@ -85,13 +156,14 @@ pub async fn handle_connect_cluster_nodes(
} }
pub async fn handle_get_cluster_layout(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> { pub async fn handle_get_cluster_layout(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> {
let res = format_cluster_layout(&garage.system.get_cluster_layout()); let res = format_cluster_layout(&garage.system.cluster_layout());
Ok(json_ok_response(&res)?) Ok(json_ok_response(&res)?)
} }
fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResponse { fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResponse {
let roles = layout let roles = layout
.current()
.roles .roles
.items() .items()
.iter() .iter()
@ -105,10 +177,12 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let staged_role_changes = layout let staged_role_changes = layout
.staging_roles .staging
.get()
.roles
.items() .items()
.iter() .iter()
.filter(|(k, _, v)| layout.roles.get(k) != Some(v)) .filter(|(k, _, v)| layout.current().roles.get(k) != Some(v))
.map(|(k, _, v)| match &v.0 { .map(|(k, _, v)| match &v.0 {
None => NodeRoleChange { None => NodeRoleChange {
id: hex::encode(k), id: hex::encode(k),
@ -126,7 +200,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp
.collect::<Vec<_>>(); .collect::<Vec<_>>();
GetClusterLayoutResponse { GetClusterLayoutResponse {
version: layout.version, version: layout.current().version,
roles, roles,
staged_role_changes, staged_role_changes,
} }
@ -155,8 +229,8 @@ struct GetClusterStatusResponse {
garage_features: Option<&'static [&'static str]>, garage_features: Option<&'static [&'static str]>,
rust_version: &'static str, rust_version: &'static str,
db_engine: String, db_engine: String,
known_nodes: Vec<KnownNodeResp>, layout_version: u64,
layout: GetClusterLayoutResponse, nodes: Vec<NodeResp>,
} }
#[derive(Serialize)] #[derive(Serialize)]
@ -190,14 +264,27 @@ struct NodeRoleResp {
tags: Vec<String>, tags: Vec<String>,
} }
#[derive(Serialize)] #[derive(Serialize, Default)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
struct KnownNodeResp { struct FreeSpaceResp {
available: u64,
total: u64,
}
#[derive(Serialize, Default)]
#[serde(rename_all = "camelCase")]
struct NodeResp {
id: String, id: String,
addr: SocketAddr, role: Option<NodeRoleResp>,
addr: Option<SocketAddr>,
hostname: Option<String>,
is_up: bool, is_up: bool,
last_seen_secs_ago: Option<u64>, last_seen_secs_ago: Option<u64>,
hostname: String, draining: bool,
#[serde(skip_serializing_if = "Option::is_none")]
data_partition: Option<FreeSpaceResp>,
#[serde(skip_serializing_if = "Option::is_none")]
metadata_partition: Option<FreeSpaceResp>,
} }
// ---- update functions ---- // ---- update functions ----
@ -208,10 +295,10 @@ pub async fn handle_update_cluster_layout(
) -> Result<Response<ResBody>, Error> { ) -> Result<Response<ResBody>, Error> {
let updates = parse_json_body::<UpdateClusterLayoutRequest, _, Error>(req).await?; let updates = parse_json_body::<UpdateClusterLayoutRequest, _, Error>(req).await?;
let mut layout = garage.system.get_cluster_layout(); let mut layout = garage.system.cluster_layout().clone();
let mut roles = layout.roles.clone(); let mut roles = layout.current().roles.clone();
roles.merge(&layout.staging_roles); roles.merge(&layout.staging.get().roles);
for change in updates { for change in updates {
let node = hex::decode(&change.id).ok_or_bad_request("Invalid node identifier")?; let node = hex::decode(&change.id).ok_or_bad_request("Invalid node identifier")?;
@ -232,11 +319,17 @@ pub async fn handle_update_cluster_layout(
}; };
layout layout
.staging_roles .staging
.get_mut()
.roles
.merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role)));
} }
garage.system.update_cluster_layout(&layout).await?; garage
.system
.layout_manager
.update_cluster_layout(&layout)
.await?;
let res = format_cluster_layout(&layout); let res = format_cluster_layout(&layout);
Ok(json_ok_response(&res)?) Ok(json_ok_response(&res)?)
@ -246,12 +339,16 @@ pub async fn handle_apply_cluster_layout(
garage: &Arc<Garage>, garage: &Arc<Garage>,
req: Request<IncomingBody>, req: Request<IncomingBody>,
) -> Result<Response<ResBody>, Error> { ) -> Result<Response<ResBody>, Error> {
let param = parse_json_body::<ApplyRevertLayoutRequest, _, Error>(req).await?; let param = parse_json_body::<ApplyLayoutRequest, _, Error>(req).await?;
let layout = garage.system.get_cluster_layout(); let layout = garage.system.cluster_layout().clone();
let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; let (layout, msg) = layout.apply_staged_changes(Some(param.version))?;
garage.system.update_cluster_layout(&layout).await?; garage
.system
.layout_manager
.update_cluster_layout(&layout)
.await?;
let res = ApplyClusterLayoutResponse { let res = ApplyClusterLayoutResponse {
message: msg, message: msg,
@ -262,13 +359,14 @@ pub async fn handle_apply_cluster_layout(
pub async fn handle_revert_cluster_layout( pub async fn handle_revert_cluster_layout(
garage: &Arc<Garage>, garage: &Arc<Garage>,
req: Request<IncomingBody>,
) -> Result<Response<ResBody>, Error> { ) -> Result<Response<ResBody>, Error> {
let param = parse_json_body::<ApplyRevertLayoutRequest, _, Error>(req).await?; let layout = garage.system.cluster_layout().clone();
let layout = layout.revert_staged_changes()?;
let layout = garage.system.get_cluster_layout(); garage
let layout = layout.revert_staged_changes(Some(param.version))?; .system
garage.system.update_cluster_layout(&layout).await?; .layout_manager
.update_cluster_layout(&layout)
.await?;
let res = format_cluster_layout(&layout); let res = format_cluster_layout(&layout);
Ok(json_ok_response(&res)?) Ok(json_ok_response(&res)?)
@ -280,7 +378,7 @@ type UpdateClusterLayoutRequest = Vec<NodeRoleChange>;
#[derive(Deserialize)] #[derive(Deserialize)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
struct ApplyRevertLayoutRequest { struct ApplyLayoutRequest {
version: u64, version: u64,
} }

View file

@ -59,9 +59,7 @@ impl CommonError {
pub fn http_status_code(&self) -> StatusCode { pub fn http_status_code(&self) -> StatusCode {
match self { match self {
CommonError::InternalError( CommonError::InternalError(
GarageError::Timeout GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..),
| GarageError::RemoteError(_)
| GarageError::Quorum(_, _, _, _),
) => StatusCode::SERVICE_UNAVAILABLE, ) => StatusCode::SERVICE_UNAVAILABLE,
CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
StatusCode::INTERNAL_SERVER_ERROR StatusCode::INTERNAL_SERVER_ERROR
@ -80,9 +78,7 @@ impl CommonError {
match self { match self {
CommonError::Forbidden(_) => "AccessDenied", CommonError::Forbidden(_) => "AccessDenied",
CommonError::InternalError( CommonError::InternalError(
GarageError::Timeout GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..),
| GarageError::RemoteError(_)
| GarageError::Quorum(_, _, _, _),
) => "ServiceUnavailable", ) => "ServiceUnavailable",
CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
"InternalError" "InternalError"

View file

@ -1,9 +1,6 @@
use std::sync::Arc;
use hyper::Response; use hyper::Response;
use serde::Serialize; use serde::Serialize;
use garage_rpc::ring::Ring;
use garage_table::util::*; use garage_table::util::*;
use garage_model::k2v::item_table::{BYTES, CONFLICTS, ENTRIES, VALUES}; use garage_model::k2v::item_table::{BYTES, CONFLICTS, ENTRIES, VALUES};
@ -27,7 +24,11 @@ pub async fn handle_read_index(
let reverse = reverse.unwrap_or(false); let reverse = reverse.unwrap_or(false);
let ring: Arc<Ring> = garage.system.ring.borrow().clone(); let node_id_vec = garage
.system
.cluster_layout()
.all_nongateway_nodes()
.to_vec();
let (partition_keys, more, next_start) = read_range( let (partition_keys, more, next_start) = read_range(
&garage.k2v.counter_table.table, &garage.k2v.counter_table.table,
@ -36,7 +37,7 @@ pub async fn handle_read_index(
&start, &start,
&end, &end,
limit, limit,
Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), Some((DeletedFilter::NotDeleted, node_id_vec)),
EnumerationOrder::from_reverse(reverse), EnumerationOrder::from_reverse(reverse),
) )
.await?; .await?;
@ -55,7 +56,7 @@ pub async fn handle_read_index(
partition_keys: partition_keys partition_keys: partition_keys
.into_iter() .into_iter()
.map(|part| { .map(|part| {
let vals = part.filtered_values(&ring); let vals = part.filtered_values(&garage.system.cluster_layout());
ReadIndexResponseEntry { ReadIndexResponseEntry {
pk: part.sk, pk: part.sk,
entries: *vals.get(&s_entries).unwrap_or(&0), entries: *vals.get(&s_entries).unwrap_or(&0),

View file

@ -1,7 +1,7 @@
use std::pin::Pin; use std::pin::Pin;
use std::time::{Duration, SystemTime, UNIX_EPOCH}; use std::time::{Duration, SystemTime, UNIX_EPOCH};
use futures::{stream, stream::Stream, StreamExt}; use futures::{stream, stream::Stream, StreamExt, TryStreamExt};
use md5::{Digest as Md5Digest, Md5}; use md5::{Digest as Md5Digest, Md5};
use bytes::Bytes; use bytes::Bytes;
@ -9,9 +9,11 @@ use hyper::{Request, Response};
use serde::Serialize; use serde::Serialize;
use garage_net::bytes_buf::BytesBuf; use garage_net::bytes_buf::BytesBuf;
use garage_net::stream::read_stream_to_end;
use garage_rpc::rpc_helper::OrderTag; use garage_rpc::rpc_helper::OrderTag;
use garage_table::*; use garage_table::*;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error as GarageError;
use garage_util::time::*; use garage_util::time::*;
use garage_model::s3::block_ref_table::*; use garage_model::s3::block_ref_table::*;
@ -21,11 +23,15 @@ use garage_model::s3::version_table::*;
use crate::helpers::*; use crate::helpers::*;
use crate::s3::api_server::{ReqBody, ResBody}; use crate::s3::api_server::{ReqBody, ResBody};
use crate::s3::encryption::EncryptionParams;
use crate::s3::error::*; use crate::s3::error::*;
use crate::s3::get::full_object_byte_stream;
use crate::s3::multipart; use crate::s3::multipart;
use crate::s3::put::get_headers; use crate::s3::put::{get_headers, save_stream, SaveStreamResult};
use crate::s3::xml::{self as s3_xml, xmlns_tag}; use crate::s3::xml::{self as s3_xml, xmlns_tag};
// -------- CopyObject ---------
pub async fn handle_copy( pub async fn handle_copy(
ctx: ReqCtx, ctx: ReqCtx,
req: &Request<ReqBody>, req: &Request<ReqBody>,
@ -35,38 +41,114 @@ pub async fn handle_copy(
let source_object = get_copy_source(&ctx, req).await?; let source_object = get_copy_source(&ctx, req).await?;
let ReqCtx {
garage,
bucket_id: dest_bucket_id,
..
} = ctx;
let (source_version, source_version_data, source_version_meta) = let (source_version, source_version_data, source_version_meta) =
extract_source_info(&source_object)?; extract_source_info(&source_object)?;
// Check precondition, e.g. x-amz-copy-source-if-match // Check precondition, e.g. x-amz-copy-source-if-match
copy_precondition.check(source_version, &source_version_meta.etag)?; copy_precondition.check(source_version, &source_version_meta.etag)?;
// Determine encryption parameters
let (source_encryption, source_object_headers) =
EncryptionParams::check_decrypt_for_copy_source(
&ctx.garage,
req.headers(),
&source_version_meta.encryption,
)?;
let dest_encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?;
// Determine headers of destination object
let dest_object_headers = match req.headers().get("x-amz-metadata-directive") {
Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => {
get_headers(req.headers())?
}
_ => source_object_headers.into_owned(),
};
// Do actual object copying
let res = if EncryptionParams::is_same(&source_encryption, &dest_encryption) {
// If source and dest are both unencrypted, or if the encryption keys
// are the same, we can just copy the metadata and link blocks of the
// old object from the new object.
handle_copy_metaonly(
ctx,
dest_key,
dest_object_headers,
dest_encryption,
source_version,
source_version_data,
source_version_meta,
)
.await?
} else {
// If source and dest encryption use different keys,
// we must decrypt content and re-encrypt, so rewrite all data blocks.
handle_copy_reencrypt(
ctx,
dest_key,
dest_object_headers,
dest_encryption,
source_version,
source_version_data,
source_encryption,
)
.await?
};
let last_modified = msec_to_rfc3339(res.version_timestamp);
let result = CopyObjectResult {
last_modified: s3_xml::Value(last_modified),
etag: s3_xml::Value(format!("\"{}\"", res.etag)),
};
let xml = s3_xml::to_xml_with_header(&result)?;
let mut resp = Response::builder()
.header("Content-Type", "application/xml")
.header("x-amz-version-id", hex::encode(res.version_uuid))
.header(
"x-amz-copy-source-version-id",
hex::encode(source_version.uuid),
);
dest_encryption.add_response_headers(&mut resp);
Ok(resp.body(string_body(xml))?)
}
async fn handle_copy_metaonly(
ctx: ReqCtx,
dest_key: &str,
dest_object_headers: ObjectVersionHeaders,
dest_encryption: EncryptionParams,
source_version: &ObjectVersion,
source_version_data: &ObjectVersionData,
source_version_meta: &ObjectVersionMeta,
) -> Result<SaveStreamResult, Error> {
let ReqCtx {
garage,
bucket_id: dest_bucket_id,
..
} = ctx;
// Generate parameters for copied object // Generate parameters for copied object
let new_uuid = gen_uuid(); let new_uuid = gen_uuid();
let new_timestamp = now_msec(); let new_timestamp = now_msec();
// Implement x-amz-metadata-directive: REPLACE let new_meta = ObjectVersionMeta {
let new_meta = match req.headers().get("x-amz-metadata-directive") { encryption: dest_encryption.encrypt_headers(dest_object_headers)?,
Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta { size: source_version_meta.size,
headers: get_headers(req.headers())?, etag: source_version_meta.etag.clone(),
size: source_version_meta.size,
etag: source_version_meta.etag.clone(),
},
_ => source_version_meta.clone(),
}; };
let etag = new_meta.etag.to_string(); let res = SaveStreamResult {
version_uuid: new_uuid,
version_timestamp: new_timestamp,
etag: new_meta.etag.clone(),
};
// Save object copy // Save object copy
match source_version_data { match source_version_data {
ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::DeleteMarker => unreachable!(),
ObjectVersionData::Inline(_meta, bytes) => { ObjectVersionData::Inline(_meta, bytes) => {
// bytes is either plaintext before&after or encrypted with the
// same keys, so it's ok to just copy it as is
let dest_object_version = ObjectVersion { let dest_object_version = ObjectVersion {
uuid: new_uuid, uuid: new_uuid,
timestamp: new_timestamp, timestamp: new_timestamp,
@ -97,7 +179,7 @@ pub async fn handle_copy(
uuid: new_uuid, uuid: new_uuid,
timestamp: new_timestamp, timestamp: new_timestamp,
state: ObjectVersionState::Uploading { state: ObjectVersionState::Uploading {
headers: new_meta.headers.clone(), encryption: new_meta.encryption.clone(),
multipart: false, multipart: false,
}, },
}; };
@ -164,23 +246,42 @@ pub async fn handle_copy(
} }
} }
let last_modified = msec_to_rfc3339(new_timestamp); Ok(res)
let result = CopyObjectResult {
last_modified: s3_xml::Value(last_modified),
etag: s3_xml::Value(format!("\"{}\"", etag)),
};
let xml = s3_xml::to_xml_with_header(&result)?;
Ok(Response::builder()
.header("Content-Type", "application/xml")
.header("x-amz-version-id", hex::encode(new_uuid))
.header(
"x-amz-copy-source-version-id",
hex::encode(source_version.uuid),
)
.body(string_body(xml))?)
} }
async fn handle_copy_reencrypt(
ctx: ReqCtx,
dest_key: &str,
dest_object_headers: ObjectVersionHeaders,
dest_encryption: EncryptionParams,
source_version: &ObjectVersion,
source_version_data: &ObjectVersionData,
source_encryption: EncryptionParams,
) -> Result<SaveStreamResult, Error> {
// basically we will read the source data (decrypt if necessary)
// and save that in a new object (encrypt if necessary),
// by combining the code used in getobject and putobject
let source_stream = full_object_byte_stream(
ctx.garage.clone(),
source_version,
source_version_data,
source_encryption,
);
save_stream(
&ctx,
dest_object_headers,
dest_encryption,
source_stream.map_err(|e| Error::from(GarageError::from(e))),
&dest_key.to_string(),
None,
None,
)
.await
}
// -------- UploadPartCopy ---------
pub async fn handle_upload_part_copy( pub async fn handle_upload_part_copy(
ctx: ReqCtx, ctx: ReqCtx,
req: &Request<ReqBody>, req: &Request<ReqBody>,
@ -193,7 +294,7 @@ pub async fn handle_upload_part_copy(
let dest_upload_id = multipart::decode_upload_id(upload_id)?; let dest_upload_id = multipart::decode_upload_id(upload_id)?;
let dest_key = dest_key.to_string(); let dest_key = dest_key.to_string();
let (source_object, (_, _, mut dest_mpu)) = futures::try_join!( let (source_object, (_, dest_version, mut dest_mpu)) = futures::try_join!(
get_copy_source(&ctx, req), get_copy_source(&ctx, req),
multipart::get_upload(&ctx, &dest_key, &dest_upload_id) multipart::get_upload(&ctx, &dest_key, &dest_upload_id)
)?; )?;
@ -206,6 +307,20 @@ pub async fn handle_upload_part_copy(
// Check precondition on source, e.g. x-amz-copy-source-if-match // Check precondition on source, e.g. x-amz-copy-source-if-match
copy_precondition.check(source_object_version, &source_version_meta.etag)?; copy_precondition.check(source_object_version, &source_version_meta.etag)?;
// Determine encryption parameters
let (source_encryption, _) = EncryptionParams::check_decrypt_for_copy_source(
&garage,
req.headers(),
&source_version_meta.encryption,
)?;
let dest_object_encryption = match dest_version.state {
ObjectVersionState::Uploading { encryption, .. } => encryption,
_ => unreachable!(),
};
let (dest_encryption, _) =
EncryptionParams::check_decrypt(&garage, req.headers(), &dest_object_encryption)?;
let same_encryption = EncryptionParams::is_same(&source_encryption, &dest_encryption);
// Check source range is valid // Check source range is valid
let source_range = match req.headers().get("x-amz-copy-source-range") { let source_range = match req.headers().get("x-amz-copy-source-range") {
Some(range) => { Some(range) => {
@ -227,21 +342,16 @@ pub async fn handle_upload_part_copy(
}; };
// Check source version is not inlined // Check source version is not inlined
match source_version_data { if matches!(source_version_data, ObjectVersionData::Inline(_, _)) {
ObjectVersionData::DeleteMarker => unreachable!(), // This is only for small files, we don't bother handling this.
ObjectVersionData::Inline(_meta, _bytes) => { // (in AWS UploadPartCopy works for parts at least 5MB which
// This is only for small files, we don't bother handling this. // is never the case of an inline object)
// (in AWS UploadPartCopy works for parts at least 5MB which return Err(Error::bad_request(
// is never the case of an inline object) "Source object is too small (minimum part size is 5Mb)",
return Err(Error::bad_request( ));
"Source object is too small (minimum part size is 5Mb)", }
));
}
ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (),
};
// Fetch source versin with its block list, // Fetch source version with its block list
// and destination version to check part hasn't yet been uploaded
let source_version = garage let source_version = garage
.version_table .version_table
.get(&source_object_version.uuid, &EmptyKey) .get(&source_object_version.uuid, &EmptyKey)
@ -251,7 +361,9 @@ pub async fn handle_upload_part_copy(
// We want to reuse blocks from the source version as much as possible. // We want to reuse blocks from the source version as much as possible.
// However, we still need to get the data from these blocks // However, we still need to get the data from these blocks
// because we need to know it to calculate the MD5sum of the part // because we need to know it to calculate the MD5sum of the part
// which is used as its ETag. // which is used as its ETag. For encrypted sources or destinations,
// we must always read(+decrypt) and then write(+encrypt), so we
// can never reuse data blocks as is.
// First, calculate what blocks we want to keep, // First, calculate what blocks we want to keep,
// and the subrange of the block to take, if the bounds of the // and the subrange of the block to take, if the bounds of the
@ -313,6 +425,8 @@ pub async fn handle_upload_part_copy(
}, },
false, false,
); );
// write an empty version now to be the parent of the block_ref entries
garage.version_table.insert(&dest_version).await?;
// Now, actually copy the blocks // Now, actually copy the blocks
let mut md5hasher = Md5::new(); let mut md5hasher = Md5::new();
@ -321,24 +435,44 @@ pub async fn handle_upload_part_copy(
// and extract the subrange if necessary. // and extract the subrange if necessary.
// The second returned value is an Option<Hash>, that is Some // The second returned value is an Option<Hash>, that is Some
// if and only if the block returned is a block that already existed // if and only if the block returned is a block that already existed
// in the Garage data store (thus we don't need to save it again). // in the Garage data store and can be reused as-is instead of having
// to save it again. This excludes encrypted source blocks that we had
// to decrypt.
let garage2 = garage.clone(); let garage2 = garage.clone();
let order_stream = OrderTag::stream(); let order_stream = OrderTag::stream();
let source_blocks = stream::iter(blocks_to_copy) let source_blocks = stream::iter(blocks_to_copy)
.enumerate() .enumerate()
.flat_map(|(i, (block_hash, range_to_copy))| { .map(|(i, (block_hash, range_to_copy))| {
let garage3 = garage2.clone(); let garage3 = garage2.clone();
stream::once(async move { async move {
let data = garage3 let stream = source_encryption
.block_manager .get_block(&garage3, &block_hash, Some(order_stream.order(i as u64)))
.rpc_get_block(&block_hash, Some(order_stream.order(i as u64)))
.await?; .await?;
let data = read_stream_to_end(stream).await?.into_bytes();
// For each item, we return a tuple of:
// 1. the full data block (decrypted)
// 2. an Option<Hash> that indicates the hash of the block in the block store,
// only if it can be re-used as-is in the copied object
match range_to_copy { match range_to_copy {
Some(r) => Ok((data.slice(r), None)), Some(r) => {
None => Ok((data, Some(block_hash))), // If we are taking a subslice of the data, we cannot reuse the block as-is
Ok((data.slice(r), None))
}
None if same_encryption => {
// If the data is unencrypted before & after, or if we are using
// the same encryption key, we can reuse the stored block, no need
// to re-send it to storage nodes.
Ok((data, Some(block_hash)))
}
None => {
// If we are decrypting / (re)encrypting with different keys,
// we cannot reuse the block as-is
Ok((data, None))
}
} }
}) }
}) })
.buffered(2)
.peekable(); .peekable();
// The defragmenter is a custom stream (defined below) that concatenates // The defragmenter is a custom stream (defined below) that concatenates
@ -346,22 +480,33 @@ pub async fn handle_upload_part_copy(
// It returns a series of (Vec<u8>, Option<Hash>). // It returns a series of (Vec<u8>, Option<Hash>).
// When it is done, it returns an empty vec. // When it is done, it returns an empty vec.
// Same as the previous iterator, the Option is Some(_) if and only if // Same as the previous iterator, the Option is Some(_) if and only if
// it's an existing block of the Garage data store. // it's an existing block of the Garage data store that can be reused.
let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks)); let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks));
let mut current_offset = 0; let mut current_offset = 0;
let mut next_block = defragmenter.next().await?; let mut next_block = defragmenter.next().await?;
// TODO this could be optimized similarly to read_and_put_blocks
// low priority because uploadpartcopy is rarely used
loop { loop {
let (data, existing_block_hash) = next_block; let (data, existing_block_hash) = next_block;
if data.is_empty() { if data.is_empty() {
break; break;
} }
let data_len = data.len() as u64;
md5hasher.update(&data[..]); md5hasher.update(&data[..]);
let must_upload = existing_block_hash.is_none(); let (final_data, must_upload, final_hash) = match existing_block_hash {
let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..])); Some(hash) if same_encryption => (data, false, hash),
_ => tokio::task::spawn_blocking(move || {
let data_enc = dest_encryption.encrypt_block(data)?;
let hash = blake2sum(&data_enc);
Ok::<_, Error>((data_enc, true, hash))
})
.await
.unwrap()?,
};
dest_version.blocks.clear(); dest_version.blocks.clear();
dest_version.blocks.put( dest_version.blocks.put(
@ -371,10 +516,10 @@ pub async fn handle_upload_part_copy(
}, },
VersionBlock { VersionBlock {
hash: final_hash, hash: final_hash,
size: data.len() as u64, size: data_len,
}, },
); );
current_offset += data.len() as u64; current_offset += data_len;
let block_ref = BlockRef { let block_ref = BlockRef {
block: final_hash, block: final_hash,
@ -382,36 +527,33 @@ pub async fn handle_upload_part_copy(
deleted: false.into(), deleted: false.into(),
}; };
let garage2 = garage.clone(); let (_, _, _, next) = futures::try_join!(
let res = futures::try_join!(
// Thing 1: if the block is not exactly a block that existed before, // Thing 1: if the block is not exactly a block that existed before,
// we need to insert that data as a new block. // we need to insert that data as a new block.
async move { async {
if must_upload { if must_upload {
garage2 garage
.block_manager .block_manager
.rpc_put_block(final_hash, data, None) .rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None)
.await .await
} else { } else {
Ok(()) Ok(())
} }
}, },
async { // Thing 2: we need to insert the block in the version
// Thing 2: we need to insert the block in the version garage.version_table.insert(&dest_version),
garage.version_table.insert(&dest_version).await?; // Thing 3: we need to add a block reference
// Thing 3: we need to add a block reference garage.block_ref_table.insert(&block_ref),
garage.block_ref_table.insert(&block_ref).await // Thing 4: we need to read the next block
},
// Thing 4: we need to prefetch the next block
defragmenter.next(), defragmenter.next(),
)?; )?;
next_block = res.2; next_block = next;
} }
assert_eq!(current_offset, source_range.length); assert_eq!(current_offset, source_range.length);
let data_md5sum = md5hasher.finalize(); let data_md5sum = md5hasher.finalize();
let etag = hex::encode(data_md5sum); let etag = dest_encryption.etag_from_md5(&data_md5sum);
// Put the part's ETag in the Versiontable // Put the part's ETag in the Versiontable
dest_mpu.parts.put( dest_mpu.parts.put(
@ -431,13 +573,14 @@ pub async fn handle_upload_part_copy(
last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)), last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)),
})?; })?;
Ok(Response::builder() let mut resp = Response::builder()
.header("Content-Type", "application/xml") .header("Content-Type", "application/xml")
.header( .header(
"x-amz-copy-source-version-id", "x-amz-copy-source-version-id",
hex::encode(source_object_version.uuid), hex::encode(source_object_version.uuid),
) );
.body(string_body(resp_xml))?) dest_encryption.add_response_headers(&mut resp);
Ok(resp.body(string_body(resp_xml))?)
} }
async fn get_copy_source(ctx: &ReqCtx, req: &Request<ReqBody>) -> Result<Object, Error> { async fn get_copy_source(ctx: &ReqCtx, req: &Request<ReqBody>) -> Result<Object, Error> {

595
src/api/s3/encryption.rs Normal file
View file

@ -0,0 +1,595 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::pin::Pin;
use aes_gcm::{
aead::stream::{DecryptorLE31, EncryptorLE31, StreamLE31},
aead::{Aead, AeadCore, KeyInit, OsRng},
aes::cipher::crypto_common::rand_core::RngCore,
aes::cipher::typenum::Unsigned,
Aes256Gcm, Key, Nonce,
};
use base64::prelude::*;
use bytes::Bytes;
use futures::stream::Stream;
use futures::task;
use tokio::io::BufReader;
use http::header::{HeaderMap, HeaderName, HeaderValue};
use garage_net::bytes_buf::BytesBuf;
use garage_net::stream::{stream_asyncread, ByteStream};
use garage_rpc::rpc_helper::OrderTag;
use garage_util::data::Hash;
use garage_util::error::Error as GarageError;
use garage_util::migrate::Migrate;
use garage_model::garage::Garage;
use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionHeaders};
use crate::common_error::*;
use crate::s3::error::Error;
// Request/response headers carrying the client-provided encryption
// parameters (algorithm name, base64 key, base64 MD5 of the key).
const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName =
HeaderName::from_static("x-amz-server-side-encryption-customer-algorithm");
const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName =
HeaderName::from_static("x-amz-server-side-encryption-customer-key");
const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName =
HeaderName::from_static("x-amz-server-side-encryption-customer-key-md5");
// Same three parameters, but describing the *source* object of a copy
// operation (see `check_decrypt_for_copy_source`).
const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName =
HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-algorithm");
const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName =
HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key");
const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName =
HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key-md5");
// The only algorithm value accepted in the `...-customer-algorithm` headers;
// any other value is rejected by `parse_request_headers`.
const CUSTOMER_ALGORITHM_AES256: &[u8] = b"AES256";
// Output type of an MD5 digest, used to carry the checksum of the client key.
type Md5Output = md5::digest::Output<md5::Md5Core>;
// Size of the nonce written at the beginning of each encrypted data block
// (nonce size of the LE31 stream construction over AES-256-GCM).
type StreamNonceSize = aes_gcm::aead::stream::NonceSize<Aes256Gcm, StreamLE31<Aes256Gcm>>;
// Data blocks are encrypted by smaller chunks of size 4096 bytes,
// so that data can be streamed when reading.
// This size has to be known and has to be constant, or data won't be
// readable anymore. DO NOT CHANGE THIS VALUE.
const STREAM_ENC_PLAIN_CHUNK_SIZE: usize = 0x1000; // 4096 bytes
// Size of one full chunk once encrypted: the plaintext chunk plus 16 bytes
// of per-chunk overhead (presumably the AES-GCM authentication tag).
const STREAM_ENC_CYPER_CHUNK_SIZE: usize = STREAM_ENC_PLAIN_CHUNK_SIZE + 16;
/// Encryption settings that apply to one object (or one request).
#[derive(Clone, Copy)]
pub enum EncryptionParams {
/// The object is stored as-is: headers and data blocks are not encrypted.
Plaintext,
/// The object is encrypted with a key supplied by the client in the
/// request headers.
SseC {
/// AES-256-GCM key decoded from the request headers.
client_key: Key<Aes256Gcm>,
/// MD5 digest of `client_key`, echoed back in response headers.
client_key_md5: Md5Output,
/// When `Some`, data blocks are zstd-compressed (at this level) before
/// being encrypted, and decompressed after being decrypted.
compression_level: Option<i32>,
},
}
impl EncryptionParams {
/// Returns `true` for every variant except [`Self::Plaintext`].
pub fn is_encrypted(&self) -> bool {
    match self {
        Self::Plaintext => false,
        Self::SseC { .. } => true,
    }
}
/// Tells whether data blocks written under `a` can be read back under `b`
/// without re-encoding: both are plaintext, or both use the same client key
/// and agree on whether blocks are compressed.
///
/// The exact compression level and the stored key MD5 are ignored.
pub fn is_same(a: &Self, b: &Self) -> bool {
    match (a, b) {
        (Self::Plaintext, Self::Plaintext) => true,
        (
            Self::SseC {
                client_key: key_a,
                compression_level: comp_a,
                ..
            },
            Self::SseC {
                client_key: key_b,
                compression_level: comp_b,
                ..
            },
        ) => key_a == key_b && comp_a.is_some() == comp_b.is_some(),
        _ => false,
    }
}
/// Builds the encryption parameters for a new object from the
/// `x-amz-server-side-encryption-customer-*` request headers.
///
/// Returns [`EncryptionParams::Plaintext`] when no such header is present,
/// and an error when the headers are present but invalid. For encrypted
/// objects, the compression level is taken from the node configuration.
pub fn new_from_headers(
    garage: &Garage,
    headers: &HeaderMap,
) -> Result<EncryptionParams, Error> {
    let parsed = parse_request_headers(
        headers,
        &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
        &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
        &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
    )?;
    let params = if let Some((client_key, client_key_md5)) = parsed {
        EncryptionParams::SseC {
            client_key,
            client_key_md5,
            compression_level: garage.config.compression_level,
        }
    } else {
        EncryptionParams::Plaintext
    };
    Ok(params)
}
/// Adds the SSE-C response headers (algorithm name and base64-encoded MD5
/// of the client key) to `resp`. Does nothing for plaintext objects.
///
/// If the builder already holds an error (for instance because an invalid
/// header was added to it earlier), nothing is inserted: the pending error
/// is left untouched and will be reported when the response is finalized
/// with `.body(..)`, instead of panicking here.
pub fn add_response_headers(&self, resp: &mut http::response::Builder) {
    let Self::SseC { client_key_md5, .. } = self else {
        return;
    };
    // `headers_mut()` returns `None` when the builder is in an error state.
    let Some(headers) = resp.headers_mut() else {
        return;
    };
    let md5 = BASE64_STANDARD.encode(client_key_md5);
    headers.insert(
        X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
        HeaderValue::from_bytes(CUSTOMER_ALGORITHM_AES256)
            .expect("algorithm name is a valid header value"),
    );
    headers.insert(
        X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
        HeaderValue::from_bytes(md5.as_bytes())
            .expect("base64 output is a valid header value"),
    );
}
/// Checks that the `x-amz-server-side-encryption-customer-*` headers of the
/// request match the encryption state of a stored object, and returns the
/// resulting parameters together with the object's (decrypted) headers.
pub fn check_decrypt<'a>(
    garage: &Garage,
    headers: &HeaderMap,
    obj_enc: &'a ObjectVersionEncryption,
) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> {
    parse_request_headers(
        headers,
        &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
        &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
        &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
    )
    .and_then(|client_key| Self::check_decrypt_common(garage, client_key, obj_enc))
}
/// Same as [`Self::check_decrypt`], but reads the key from the
/// `x-amz-copy-source-server-side-encryption-customer-*` headers, i.e. the
/// ones that describe the source object of a copy operation.
pub fn check_decrypt_for_copy_source<'a>(
    garage: &Garage,
    headers: &HeaderMap,
    obj_enc: &'a ObjectVersionEncryption,
) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> {
    parse_request_headers(
        headers,
        &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
        &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
        &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
    )
    .and_then(|client_key| Self::check_decrypt_common(garage, client_key, obj_enc))
}
fn check_decrypt_common<'a>(
garage: &Garage,
key: Option<(Key<Aes256Gcm>, Md5Output)>,
obj_enc: &'a ObjectVersionEncryption,
) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> {
match (key, &obj_enc) {
(
Some((client_key, client_key_md5)),
ObjectVersionEncryption::SseC {
headers,
compressed,
},
) => {
let enc = Self::SseC {
client_key,
client_key_md5,
compression_level: if *compressed {
Some(garage.config.compression_level.unwrap_or(1))
} else {
None
},
};
let plaintext = enc.decrypt_blob(&headers)?;
let headers = ObjectVersionHeaders::decode(&plaintext)
.ok_or_internal_error("Could not decode encrypted headers")?;
Ok((enc, Cow::Owned(headers)))
}
(None, ObjectVersionEncryption::Plaintext { headers }) => {
Ok((Self::Plaintext, Cow::Borrowed(headers)))
}
(_, ObjectVersionEncryption::SseC { .. }) => {
Err(Error::bad_request("Object is encrypted"))
}
(Some(_), _) => {
// TODO: should this be an OK scenario?
Err(Error::bad_request("Trying to decrypt a plaintext object"))
}
}
}
/// Wraps object headers in the form in which they are stored: unchanged for
/// plaintext objects, serialized then encrypted (see [`Self::encrypt_blob`])
/// for SSE-C objects. The `compressed` flag records whether data blocks of
/// this object are compressed.
pub fn encrypt_headers(
    &self,
    h: ObjectVersionHeaders,
) -> Result<ObjectVersionEncryption, Error> {
    let Self::SseC {
        compression_level, ..
    } = self
    else {
        return Ok(ObjectVersionEncryption::Plaintext { headers: h });
    };
    let encoded = h.encode().map_err(GarageError::from)?;
    let encrypted = self.encrypt_blob(&encoded)?.into_owned();
    Ok(ObjectVersionEncryption::SseC {
        headers: encrypted,
        compressed: compression_level.is_some(),
    })
}
// ---- generating object Etag values ----
/// Computes the ETag of an object from the MD5 sum of its data.
///
/// For plaintext objects this is the hex-encoded MD5 sum. For encrypted
/// objects `md5sum` is ignored and 16 random bytes are hex-encoded instead.
pub fn etag_from_md5(&self, md5sum: &[u8]) -> String {
    if matches!(self, Self::Plaintext) {
        return hex::encode(md5sum);
    }
    // AWS specifies that for encrypted objects, the Etag is not
    // the md5sum of the data, but doesn't say what it is.
    // So we just put some random bytes.
    let mut random_bytes = [0u8; 16];
    OsRng.fill_bytes(&mut random_bytes);
    hex::encode(random_bytes)
}
// ---- generic function for encrypting / decrypting blobs ----
/// Encrypts a small blob with the client key.
///
/// The output is a randomly-generated nonce followed by the ciphertext.
/// This is used for encrypting object headers and inlined data for small
/// objects. No compression is applied. Plaintext parameters return the
/// input borrowed, unchanged.
pub fn encrypt_blob<'a>(&self, blob: &'a [u8]) -> Result<Cow<'a, [u8]>, Error> {
    let Self::SseC { client_key, .. } = self else {
        return Ok(Cow::Borrowed(blob));
    };
    let nonce = Aes256Gcm::generate_nonce(&mut OsRng);
    let ciphertext = Aes256Gcm::new(client_key)
        .encrypt(&nonce, blob)
        .ok_or_internal_error("Encryption failed")?;
    let mut output = Vec::with_capacity(nonce.len() + ciphertext.len());
    output.extend_from_slice(nonce.as_slice());
    output.extend_from_slice(&ciphertext);
    Ok(Cow::Owned(output))
}
/// Reverses [`Self::encrypt_blob`]: reads the nonce at the start of `blob`
/// and decrypts the remainder with the client key.
///
/// Returns an internal error if `blob` is too short to hold a nonce, and a
/// bad request error if decryption fails. Plaintext parameters return the
/// input borrowed, unchanged.
pub fn decrypt_blob<'a>(&self, blob: &'a [u8]) -> Result<Cow<'a, [u8]>, Error> {
    let Self::SseC { client_key, .. } = self else {
        return Ok(Cow::Borrowed(blob));
    };
    let nonce_size = <Aes256Gcm as AeadCore>::NonceSize::to_usize();
    let nonce_bytes = blob
        .get(..nonce_size)
        .ok_or_internal_error("invalid encrypted data")?;
    // Cannot panic: the checked `get` above proved `blob.len() >= nonce_size`.
    let ciphertext = &blob[nonce_size..];
    let plaintext = Aes256Gcm::new(client_key)
        .decrypt(Nonce::from_slice(nonce_bytes), ciphertext)
        .ok_or_bad_request("Invalid encryption key, could not decrypt object metadata.")?;
    Ok(Cow::Owned(plaintext))
}
// ---- function for encrypting / decrypting byte streams ----
/// Fetches a data block from the block manager as a stream and passes it
/// through [`Self::decrypt_block_stream`], so that the caller receives
/// decrypted (and decompressed, if applicable) data. For plaintext objects
/// the raw stream is returned without any processing.
pub async fn get_block(
    &self,
    garage: &Garage,
    hash: &Hash,
    order: Option<OrderTag>,
) -> Result<ByteStream, GarageError> {
    let stored = garage
        .block_manager
        .rpc_get_block_streaming(hash, order)
        .await?;
    let readable = self.decrypt_block_stream(stored);
    Ok(readable)
}
/// Turns the stream of a stored data block into a stream of its original
/// content: identity for plaintext, decryption for SSE-C, followed by zstd
/// decompression when the parameters carry a compression level.
pub fn decrypt_block_stream(&self, stream: ByteStream) -> ByteStream {
    let Self::SseC {
        client_key,
        compression_level,
        ..
    } = self
    else {
        return stream;
    };
    let decrypted = DecryptStream::new(stream, *client_key);
    if compression_level.is_none() {
        return Box::pin(decrypted);
    }
    // The zstd decoder works on a buffered reader, so the decrypted stream
    // is adapted to a reader and the decoder output back to a stream.
    let buffered = BufReader::new(stream_asyncread(Box::pin(decrypted)));
    let decoder = async_compression::tokio::bufread::ZstdDecoder::new(buffered);
    Box::pin(tokio_util::io::ReaderStream::new(decoder))
}
/// Prepares a data block for storage.
///
/// Plaintext parameters return the block unchanged. For SSE-C, the block is
/// optionally zstd-compressed, then encrypted as a random nonce followed by
/// a sequence of chunks of `STREAM_ENC_PLAIN_CHUNK_SIZE` plaintext bytes
/// each; the final chunk (possibly shorter, or empty for an empty block) is
/// encrypted with `encrypt_last`.
pub fn encrypt_block(&self, block: Bytes) -> Result<Bytes, Error> {
    let Self::SseC {
        client_key,
        compression_level,
        ..
    } = self
    else {
        return Ok(block);
    };

    let payload = match compression_level {
        Some(level) => Cow::Owned(
            garage_block::zstd_encode(block.as_ref(), *level)
                .ok_or_internal_error("failed to compress data block")?,
        ),
        None => Cow::Borrowed(block.as_ref()),
    };

    let mut output = Vec::with_capacity(payload.len() + 32 + payload.len() / 64);

    let mut nonce: Nonce<StreamNonceSize> = Default::default();
    OsRng.fill_bytes(&mut nonce);
    output.extend_from_slice(nonce.as_slice());

    let mut cipher = EncryptorLE31::<Aes256Gcm>::new(client_key, &nonce);

    // Every chunk but the final one goes through `encrypt_next`. The final
    // chunk is kept aside; it stays empty if the payload itself is empty.
    let mut chunks = payload.chunks(STREAM_ENC_PLAIN_CHUNK_SIZE).peekable();
    let mut final_chunk: &[u8] = &[];
    while let Some(chunk) = chunks.next() {
        if chunks.peek().is_none() {
            final_chunk = chunk;
            break;
        }
        let chunk_enc = cipher
            .encrypt_next(chunk)
            .ok_or_internal_error("failed to encrypt chunk")?;
        assert_eq!(chunk.len(), STREAM_ENC_PLAIN_CHUNK_SIZE);
        assert_eq!(chunk_enc.len(), STREAM_ENC_CYPER_CHUNK_SIZE);
        output.extend_from_slice(&chunk_enc);
    }

    // use encrypt_last for the last chunk
    let chunk_enc = cipher
        .encrypt_last(final_chunk)
        .ok_or_internal_error("failed to encrypt chunk")?;
    output.extend_from_slice(&chunk_enc);

    Ok(output.into())
}
}
/// Extracts a client-provided encryption key from request headers.
///
/// `alg_header`, `key_header` and `md5_header` name the three headers to
/// read (either the regular SSE-C headers or their copy-source variants).
///
/// Returns:
/// * `Ok(Some((key, md5)))` if the algorithm header is `AES256`, the key
///   header decodes (base64) to exactly 32 bytes, and the MD5 header decodes
///   (base64) to the MD5 digest of that key;
/// * `Ok(None)` if none of the three headers is present;
/// * an error if the algorithm is not `AES256`, if a header is missing or
///   malformed, if the MD5 does not match, or if a key/MD5 header is given
///   without an algorithm header.
fn parse_request_headers(
    headers: &HeaderMap,
    alg_header: &HeaderName,
    key_header: &HeaderName,
    md5_header: &HeaderName,
) -> Result<Option<(Key<Aes256Gcm>, Md5Output)>, Error> {
    let alg = headers.get(alg_header).map(HeaderValue::as_bytes);
    let key = headers.get(key_header).map(HeaderValue::as_bytes);
    let md5 = headers.get(md5_header).map(HeaderValue::as_bytes);

    match alg {
        Some(CUSTOMER_ALGORITHM_AES256) => {
            use md5::{Digest, Md5};

            let key_b64 =
                key.ok_or_bad_request("Missing server-side-encryption-customer-key header")?;
            let key_bytes: [u8; 32] = BASE64_STANDARD
                .decode(&key_b64)
                .ok_or_bad_request(
                    "Invalid server-side-encryption-customer-key header: invalid base64",
                )?
                .try_into()
                .ok()
                .ok_or_bad_request(
                    "Invalid server-side-encryption-customer-key header: invalid length",
                )?;

            let md5_b64 =
                md5.ok_or_bad_request("Missing server-side-encryption-customer-key-md5 header")?;
            let md5_bytes = BASE64_STANDARD.decode(&md5_b64).ok_or_bad_request(
                "Invalid server-side-encryption-customer-key-md5 header: invalid base64",
            )?;

            // The MD5 sent by the client is only a transmission checksum of
            // the key: recompute it and refuse the request on mismatch.
            let mut hasher = Md5::new();
            hasher.update(&key_bytes[..]);
            let our_md5 = hasher.finalize();
            if our_md5.as_slice() != md5_bytes.as_slice() {
                return Err(Error::bad_request(
                    "Server-side encryption client key MD5 checksum does not match",
                ));
            }

            Ok(Some((key_bytes.into(), our_md5)))
        }
        Some(alg) => Err(Error::InvalidEncryptionAlgorithm(
            String::from_utf8_lossy(alg).into_owned(),
        )),
        None => {
            // Without an algorithm header, stray key/MD5 headers are an error
            // rather than being silently ignored.
            if key.is_some() || md5.is_some() {
                Err(Error::bad_request(
                    "Unexpected server-side-encryption-customer-key{,-md5} header(s)",
                ))
            } else {
                Ok(None)
            }
        }
    }
}
// ---- encrypt & decrypt streams ----
/// Stream adapter that reads an encrypted data block (nonce followed by
/// encrypted chunks) from an inner byte stream and yields the decrypted
/// content chunk by chunk.
#[pin_project::pin_project]
struct DecryptStream {
/// Inner stream providing the encrypted bytes.
#[pin]
stream: ByteStream,
/// Set to `true` once the inner stream has returned `None`.
done_reading: bool,
/// Bytes received from the inner stream that have not been decrypted yet.
buf: BytesBuf,
/// Client key used to initialize the decryptor once the nonce is known.
key: Key<Aes256Gcm>,
/// Progress of the decryption (see `DecryptStreamState`).
state: DecryptStreamState,
}
/// Progress of a `DecryptStream`.
enum DecryptStreamState {
/// The nonce at the beginning of the stream has not been read yet.
Starting,
/// The nonce has been read; holds the decryptor used for the chunks.
Running(DecryptorLE31<Aes256Gcm>),
/// The last chunk has been handed to the decryptor, which was consumed.
Done,
}
impl DecryptStream {
    /// Creates a decrypting adapter over `stream` using `key`. Nothing is
    /// read from `stream` until the adapter is polled.
    fn new(stream: ByteStream, key: Key<Aes256Gcm>) -> Self {
        let buf = BytesBuf::new();
        let state = DecryptStreamState::Starting;
        Self {
            stream,
            key,
            buf,
            state,
            done_reading: false,
        }
    }
}
// Decrypting stream: yields the plaintext of each encrypted chunk read from
// the inner stream. Errors of the inner stream are forwarded as-is; a missing
// nonce, trailing data or a failed decryption are reported as I/O errors.
impl Stream for DecryptStream {
type Item = Result<Bytes, std::io::Error>;
// Proceeds in three steps on each call:
// 1. if the nonce has not been read yet, read it and create the decryptor;
// 2. buffer enough input to tell whether the next chunk is the last one;
// 3. decrypt one chunk from the buffer and yield it.
fn poll_next(
self: Pin<&mut Self>,
cx: &mut task::Context<'_>,
) -> task::Poll<Option<Self::Item>> {
use std::task::Poll;
let mut this = self.project();
// The first bytes of the stream should contain the starting nonce.
// If we don't have a Running state, it means that we haven't
// yet read the nonce.
while matches!(this.state, DecryptStreamState::Starting) {
let nonce_size = StreamNonceSize::to_usize();
if let Some(nonce) = this.buf.take_exact(nonce_size) {
let nonce = Nonce::from_slice(nonce.as_ref());
*this.state = DecryptStreamState::Running(DecryptorLE31::new(&this.key, nonce));
break;
}
// Not enough buffered bytes for the nonce yet: pull more input.
// `ready!` returns `Poll::Pending` from this function if the inner
// stream is not ready.
match futures::ready!(this.stream.as_mut().poll_next(cx)) {
Some(Ok(bytes)) => {
this.buf.extend(bytes);
}
Some(Err(e)) => {
return Poll::Ready(Some(Err(e)));
}
None => {
return Poll::Ready(Some(Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
"Decrypt: unexpected EOF, could not read nonce",
))));
}
}
}
// Read at least one byte more than the encrypted chunk size
// (if possible), so that we know if we are decrypting the
// last chunk or not.
while !*this.done_reading && this.buf.len() <= STREAM_ENC_CYPER_CHUNK_SIZE {
match futures::ready!(this.stream.as_mut().poll_next(cx)) {
Some(Ok(bytes)) => {
this.buf.extend(bytes);
}
Some(Err(e)) => {
return Poll::Ready(Some(Err(e)));
}
None => {
*this.done_reading = true;
break;
}
}
}
// The last chunk was already decrypted by a previous call: the stream is
// finished, unless unexpected data is still sitting in the buffer.
if matches!(this.state, DecryptStreamState::Done) {
if !this.buf.is_empty() {
return Poll::Ready(Some(Err(std::io::Error::new(
std::io::ErrorKind::Other,
"Decrypt: unexpected bytes after last encrypted chunk",
))));
}
return Poll::Ready(None);
}
let res = if this.buf.len() > STREAM_ENC_CYPER_CHUNK_SIZE {
// we have strictly more bytes than the encrypted chunk size,
// so we know this is not the last
let DecryptStreamState::Running(ref mut cipher) = this.state else {
unreachable!()
};
let chunk = this.buf.take_exact(STREAM_ENC_CYPER_CHUNK_SIZE).unwrap();
let chunk_dec = cipher.decrypt_next(chunk.as_ref());
if let Ok(c) = &chunk_dec {
assert_eq!(c.len(), STREAM_ENC_PLAIN_CHUNK_SIZE);
}
chunk_dec
} else {
// We have one encrypted chunk size or less, even though we tried
// to read more, so this is the last chunk. Decrypt using the
// appropriate decrypt_last() function that then destroys the cipher.
let state = std::mem::replace(this.state, DecryptStreamState::Done);
let DecryptStreamState::Running(cipher) = state else {
unreachable!()
};
let chunk = this.buf.take_all();
cipher.decrypt_last(chunk.as_ref())
};
match res {
// An empty decrypted chunk ends the stream instead of being yielded.
Ok(bytes) if bytes.is_empty() => Poll::Ready(None),
Ok(bytes) => Poll::Ready(Some(Ok(bytes.into()))),
// The underlying decryption error is replaced by a generic I/O error.
Err(_) => Poll::Ready(Some(Err(std::io::Error::new(
std::io::ErrorKind::Other,
"Decryption failed",
)))),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use futures::stream::StreamExt;
    use garage_net::stream::read_stream_to_end;

    /// Deterministic input made of many packets of varying length and content.
    fn stream() -> ByteStream {
        let packets = futures::stream::iter(16usize..1024).map(|i| {
            let fill = (i % 256) as u8;
            let len = (i * 37) % 1024;
            Ok::<_, std::io::Error>(Bytes::from(vec![fill; len]))
        });
        Box::pin(packets)
    }

    /// Encrypts a block then decrypts it through the streaming decryptor,
    /// and checks that the original content is recovered.
    async fn test_block_enc(compression_level: Option<i32>) {
        let params = EncryptionParams::SseC {
            client_key: Aes256Gcm::generate_key(&mut OsRng),
            client_key_md5: Default::default(), // not needed
            compression_level,
        };

        let original = read_stream_to_end(stream()).await.unwrap().into_bytes();

        let encrypted = params.encrypt_block(original.clone()).unwrap();
        let encrypted_stream: ByteStream = Box::pin(futures::stream::once(async {
            Ok::<_, std::io::Error>(encrypted)
        }));

        let decrypted_stream = params.decrypt_block_stream(encrypted_stream);
        let decrypted = read_stream_to_end(decrypted_stream)
            .await
            .unwrap()
            .into_bytes();

        assert_eq!(original, decrypted);
        assert!(decrypted.len() > 128000);
    }

    #[tokio::test]
    async fn test_encrypt_block() {
        test_block_enc(None).await
    }

    #[tokio::test]
    async fn test_encrypt_block_compressed() {
        test_block_enc(Some(1)).await
    }
}

View file

@ -65,6 +65,10 @@ pub enum Error {
#[error(display = "Invalid HTTP range: {:?}", _0)] #[error(display = "Invalid HTTP range: {:?}", _0)]
InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)), InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)),
/// The client requested a server-side encryption algorithm other than AES256
#[error(display = "Invalid encryption algorithm: {:?}, should be AES256", _0)]
InvalidEncryptionAlgorithm(String),
/// The client sent a request for an action not supported by garage /// The client sent a request for an action not supported by garage
#[error(display = "Unimplemented action: {}", _0)] #[error(display = "Unimplemented action: {}", _0)]
NotImplemented(String), NotImplemented(String),
@ -126,6 +130,7 @@ impl Error {
Error::InvalidXml(_) => "MalformedXML", Error::InvalidXml(_) => "MalformedXML",
Error::InvalidRange(_) => "InvalidRange", Error::InvalidRange(_) => "InvalidRange",
Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest", Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest",
Error::InvalidEncryptionAlgorithm(_) => "InvalidEncryptionAlgorithmError",
} }
} }
} }
@ -143,6 +148,7 @@ impl ApiError for Error {
| Error::InvalidPart | Error::InvalidPart
| Error::InvalidPartOrder | Error::InvalidPartOrder
| Error::EntityTooSmall | Error::EntityTooSmall
| Error::InvalidEncryptionAlgorithm(_)
| Error::InvalidXml(_) | Error::InvalidXml(_)
| Error::InvalidUtf8Str(_) | Error::InvalidUtf8Str(_)
| Error::InvalidUtf8String(_) => StatusCode::BAD_REQUEST, | Error::InvalidUtf8String(_) => StatusCode::BAD_REQUEST,

View file

@ -1,10 +1,12 @@
//! Function related to GET and HEAD requests //! Function related to GET and HEAD requests
use std::collections::BTreeMap;
use std::convert::TryInto; use std::convert::TryInto;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, UNIX_EPOCH}; use std::time::{Duration, UNIX_EPOCH};
use bytes::Bytes;
use futures::future; use futures::future;
use futures::stream::{self, StreamExt}; use futures::stream::{self, Stream, StreamExt};
use http::header::{ use http::header::{
ACCEPT_RANGES, CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, ACCEPT_RANGES, CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE,
CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, EXPIRES, IF_MODIFIED_SINCE, IF_NONE_MATCH, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, EXPIRES, IF_MODIFIED_SINCE, IF_NONE_MATCH,
@ -25,6 +27,7 @@ use garage_model::s3::version_table::*;
use crate::helpers::*; use crate::helpers::*;
use crate::s3::api_server::ResBody; use crate::s3::api_server::ResBody;
use crate::s3::encryption::EncryptionParams;
use crate::s3::error::*; use crate::s3::error::*;
const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count";
@ -42,6 +45,8 @@ pub struct GetObjectOverrides {
fn object_headers( fn object_headers(
version: &ObjectVersion, version: &ObjectVersion,
version_meta: &ObjectVersionMeta, version_meta: &ObjectVersionMeta,
headers: &ObjectVersionHeaders,
encryption: EncryptionParams,
) -> http::response::Builder { ) -> http::response::Builder {
debug!("Version meta: {:?}", version_meta); debug!("Version meta: {:?}", version_meta);
@ -49,7 +54,6 @@ fn object_headers(
let date_str = httpdate::fmt_http_date(date); let date_str = httpdate::fmt_http_date(date);
let mut resp = Response::builder() let mut resp = Response::builder()
.header(CONTENT_TYPE, version_meta.headers.content_type.to_string())
.header(LAST_MODIFIED, date_str) .header(LAST_MODIFIED, date_str)
.header(ACCEPT_RANGES, "bytes".to_string()); .header(ACCEPT_RANGES, "bytes".to_string());
@ -57,10 +61,27 @@ fn object_headers(
resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag));
} }
for (k, v) in version_meta.headers.other.iter() { // When metadata is retrieved through the REST API, Amazon S3 combines headers that
resp = resp.header(k, v.to_string()); // have the same name (ignoring case) into a comma-delimited list.
// See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html
let mut headers_by_name = BTreeMap::new();
for (name, value) in headers.0.iter() {
match headers_by_name.get_mut(name) {
None => {
headers_by_name.insert(name, vec![value.as_str()]);
}
Some(headers) => {
headers.push(value.as_str());
}
}
} }
for (name, values) in headers_by_name {
resp = resp.header(name, values.join(","));
}
encryption.add_response_headers(&mut resp);
resp resp
} }
@ -175,21 +196,27 @@ pub async fn handle_head_without_ctx(
return Ok(cached); return Ok(cached);
} }
let (encryption, headers) =
EncryptionParams::check_decrypt(&garage, req.headers(), &version_meta.encryption)?;
if let Some(pn) = part_number { if let Some(pn) = part_number {
match version_data { match version_data {
ObjectVersionData::Inline(_, bytes) => { ObjectVersionData::Inline(_, _) => {
if pn != 1 { if pn != 1 {
return Err(Error::InvalidPart); return Err(Error::InvalidPart);
} }
Ok(object_headers(object_version, version_meta) let bytes_len = version_meta.size;
.header(CONTENT_LENGTH, format!("{}", bytes.len())) Ok(
.header( object_headers(object_version, version_meta, &headers, encryption)
CONTENT_RANGE, .header(CONTENT_LENGTH, format!("{}", bytes_len))
format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()), .header(
) CONTENT_RANGE,
.header(X_AMZ_MP_PARTS_COUNT, "1") format!("bytes 0-{}/{}", bytes_len - 1, bytes_len),
.status(StatusCode::PARTIAL_CONTENT) )
.body(empty_body())?) .header(X_AMZ_MP_PARTS_COUNT, "1")
.status(StatusCode::PARTIAL_CONTENT)
.body(empty_body())?,
)
} }
ObjectVersionData::FirstBlock(_, _) => { ObjectVersionData::FirstBlock(_, _) => {
let version = garage let version = garage
@ -201,28 +228,32 @@ pub async fn handle_head_without_ctx(
let (part_offset, part_end) = let (part_offset, part_end) =
calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?;
Ok(object_headers(object_version, version_meta) Ok(
.header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) object_headers(object_version, version_meta, &headers, encryption)
.header( .header(CONTENT_LENGTH, format!("{}", part_end - part_offset))
CONTENT_RANGE, .header(
format!( CONTENT_RANGE,
"bytes {}-{}/{}", format!(
part_offset, "bytes {}-{}/{}",
part_end - 1, part_offset,
version_meta.size part_end - 1,
), version_meta.size
) ),
.header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) )
.status(StatusCode::PARTIAL_CONTENT) .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?))
.body(empty_body())?) .status(StatusCode::PARTIAL_CONTENT)
.body(empty_body())?,
)
} }
_ => unreachable!(), _ => unreachable!(),
} }
} else { } else {
Ok(object_headers(object_version, version_meta) Ok(
.header(CONTENT_LENGTH, format!("{}", version_meta.size)) object_headers(object_version, version_meta, &headers, encryption)
.status(StatusCode::OK) .header(CONTENT_LENGTH, format!("{}", version_meta.size))
.body(empty_body())?) .status(StatusCode::OK)
.body(empty_body())?,
)
} }
} }
@ -273,23 +304,41 @@ pub async fn handle_get_without_ctx(
return Ok(cached); return Ok(cached);
} }
let (enc, headers) =
EncryptionParams::check_decrypt(&garage, req.headers(), &last_v_meta.encryption)?;
match (part_number, parse_range_header(req, last_v_meta.size)?) { match (part_number, parse_range_header(req, last_v_meta.size)?) {
(Some(_), Some(_)) => Err(Error::bad_request( (Some(_), Some(_)) => Err(Error::bad_request(
"Cannot specify both partNumber and Range header", "Cannot specify both partNumber and Range header",
)), )),
(Some(pn), None) => handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await, (Some(pn), None) => {
handle_get_part(garage, last_v, last_v_data, last_v_meta, enc, &headers, pn).await
}
(None, Some(range)) => { (None, Some(range)) => {
handle_get_range( handle_get_range(
garage, garage,
last_v, last_v,
last_v_data, last_v_data,
last_v_meta, last_v_meta,
enc,
&headers,
range.start, range.start,
range.start + range.length, range.start + range.length,
) )
.await .await
} }
(None, None) => handle_get_full(garage, last_v, last_v_data, last_v_meta, overrides).await, (None, None) => {
handle_get_full(
garage,
last_v,
last_v_data,
last_v_meta,
enc,
&headers,
overrides,
)
.await
}
} }
} }
@ -298,17 +347,36 @@ async fn handle_get_full(
version: &ObjectVersion, version: &ObjectVersion,
version_data: &ObjectVersionData, version_data: &ObjectVersionData,
version_meta: &ObjectVersionMeta, version_meta: &ObjectVersionMeta,
encryption: EncryptionParams,
headers: &ObjectVersionHeaders,
overrides: GetObjectOverrides, overrides: GetObjectOverrides,
) -> Result<Response<ResBody>, Error> { ) -> Result<Response<ResBody>, Error> {
let mut resp_builder = object_headers(version, version_meta) let mut resp_builder = object_headers(version, version_meta, &headers, encryption)
.header(CONTENT_LENGTH, format!("{}", version_meta.size)) .header(CONTENT_LENGTH, format!("{}", version_meta.size))
.status(StatusCode::OK); .status(StatusCode::OK);
getobject_override_headers(overrides, &mut resp_builder)?; getobject_override_headers(overrides, &mut resp_builder)?;
let stream = full_object_byte_stream(garage, version, version_data, encryption);
Ok(resp_builder.body(response_body_from_stream(stream))?)
}
pub fn full_object_byte_stream(
garage: Arc<Garage>,
version: &ObjectVersion,
version_data: &ObjectVersionData,
encryption: EncryptionParams,
) -> ByteStream {
match &version_data { match &version_data {
ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::DeleteMarker => unreachable!(),
ObjectVersionData::Inline(_, bytes) => { ObjectVersionData::Inline(_, bytes) => {
Ok(resp_builder.body(bytes_body(bytes.to_vec().into()))?) let bytes = bytes.to_vec();
Box::pin(futures::stream::once(async move {
encryption
.decrypt_blob(&bytes)
.map(|x| Bytes::from(x.to_vec()))
.map_err(std_error_from_read_error)
}))
} }
ObjectVersionData::FirstBlock(_, first_block_hash) => { ObjectVersionData::FirstBlock(_, first_block_hash) => {
let (tx, rx) = mpsc::channel::<ByteStream>(2); let (tx, rx) = mpsc::channel::<ByteStream>(2);
@ -324,19 +392,18 @@ async fn handle_get_full(
garage2.version_table.get(&version_uuid, &EmptyKey).await garage2.version_table.get(&version_uuid, &EmptyKey).await
}); });
let stream_block_0 = garage let stream_block_0 = encryption
.block_manager .get_block(&garage, &first_block_hash, Some(order_stream.order(0)))
.rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0)))
.await?; .await?;
tx.send(stream_block_0) tx.send(stream_block_0)
.await .await
.ok_or_message("channel closed")?; .ok_or_message("channel closed")?;
let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?; let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?;
for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) { for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) {
let stream_block_i = garage let stream_block_i = encryption
.block_manager .get_block(&garage, &vb.hash, Some(order_stream.order(i as u64)))
.rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64)))
.await?; .await?;
tx.send(stream_block_i) tx.send(stream_block_i)
.await .await
@ -354,8 +421,7 @@ async fn handle_get_full(
} }
}); });
let body = response_body_from_block_stream(rx); Box::pin(tokio_stream::wrappers::ReceiverStream::new(rx).flatten())
Ok(resp_builder.body(body)?)
} }
} }
} }
@ -365,13 +431,15 @@ async fn handle_get_range(
version: &ObjectVersion, version: &ObjectVersion,
version_data: &ObjectVersionData, version_data: &ObjectVersionData,
version_meta: &ObjectVersionMeta, version_meta: &ObjectVersionMeta,
encryption: EncryptionParams,
headers: &ObjectVersionHeaders,
begin: u64, begin: u64,
end: u64, end: u64,
) -> Result<Response<ResBody>, Error> { ) -> Result<Response<ResBody>, Error> {
// Here we do not use getobject_override_headers because we don't // Here we do not use getobject_override_headers because we don't
// want to add any overridden headers (those should not be added // want to add any overridden headers (those should not be added
// when returning PARTIAL_CONTENT) // when returning PARTIAL_CONTENT)
let resp_builder = object_headers(version, version_meta) let resp_builder = object_headers(version, version_meta, headers, encryption)
.header(CONTENT_LENGTH, format!("{}", end - begin)) .header(CONTENT_LENGTH, format!("{}", end - begin))
.header( .header(
CONTENT_RANGE, CONTENT_RANGE,
@ -382,6 +450,7 @@ async fn handle_get_range(
match &version_data { match &version_data {
ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::DeleteMarker => unreachable!(),
ObjectVersionData::Inline(_meta, bytes) => { ObjectVersionData::Inline(_meta, bytes) => {
let bytes = encryption.decrypt_blob(&bytes)?;
if end as usize <= bytes.len() { if end as usize <= bytes.len() {
let body = bytes_body(bytes[begin as usize..end as usize].to_vec().into()); let body = bytes_body(bytes[begin as usize..end as usize].to_vec().into());
Ok(resp_builder.body(body)?) Ok(resp_builder.body(body)?)
@ -398,7 +467,8 @@ async fn handle_get_range(
.await? .await?
.ok_or(Error::NoSuchKey)?; .ok_or(Error::NoSuchKey)?;
let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); let body =
body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end);
Ok(resp_builder.body(body)?) Ok(resp_builder.body(body)?)
} }
} }
@ -409,17 +479,21 @@ async fn handle_get_part(
object_version: &ObjectVersion, object_version: &ObjectVersion,
version_data: &ObjectVersionData, version_data: &ObjectVersionData,
version_meta: &ObjectVersionMeta, version_meta: &ObjectVersionMeta,
encryption: EncryptionParams,
headers: &ObjectVersionHeaders,
part_number: u64, part_number: u64,
) -> Result<Response<ResBody>, Error> { ) -> Result<Response<ResBody>, Error> {
// Same as for get_range, no getobject_override_headers // Same as for get_range, no getobject_override_headers
let resp_builder = let resp_builder = object_headers(object_version, version_meta, headers, encryption)
object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT); .status(StatusCode::PARTIAL_CONTENT);
match version_data { match version_data {
ObjectVersionData::Inline(_, bytes) => { ObjectVersionData::Inline(_, bytes) => {
if part_number != 1 { if part_number != 1 {
return Err(Error::InvalidPart); return Err(Error::InvalidPart);
} }
let bytes = encryption.decrypt_blob(&bytes)?;
assert_eq!(bytes.len() as u64, version_meta.size);
Ok(resp_builder Ok(resp_builder
.header(CONTENT_LENGTH, format!("{}", bytes.len())) .header(CONTENT_LENGTH, format!("{}", bytes.len()))
.header( .header(
@ -427,7 +501,7 @@ async fn handle_get_part(
format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()), format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()),
) )
.header(X_AMZ_MP_PARTS_COUNT, "1") .header(X_AMZ_MP_PARTS_COUNT, "1")
.body(bytes_body(bytes.to_vec().into()))?) .body(bytes_body(bytes.into_owned().into()))?)
} }
ObjectVersionData::FirstBlock(_, _) => { ObjectVersionData::FirstBlock(_, _) => {
let version = garage let version = garage
@ -439,7 +513,8 @@ async fn handle_get_part(
let (begin, end) = let (begin, end) =
calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?;
let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); let body =
body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end);
Ok(resp_builder Ok(resp_builder
.header(CONTENT_LENGTH, format!("{}", end - begin)) .header(CONTENT_LENGTH, format!("{}", end - begin))
@ -494,6 +569,7 @@ fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> {
fn body_from_blocks_range( fn body_from_blocks_range(
garage: Arc<Garage>, garage: Arc<Garage>,
encryption: EncryptionParams,
all_blocks: &[(VersionBlockKey, VersionBlock)], all_blocks: &[(VersionBlockKey, VersionBlock)],
begin: u64, begin: u64,
end: u64, end: u64,
@ -523,12 +599,11 @@ fn body_from_blocks_range(
tokio::spawn(async move { tokio::spawn(async move {
match async { match async {
let garage = garage.clone();
for (i, (block, block_offset)) in blocks.iter().enumerate() { for (i, (block, block_offset)) in blocks.iter().enumerate() {
let block_stream = garage let block_stream = encryption
.block_manager .get_block(&garage, &block.hash, Some(order_stream.order(i as u64)))
.rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64))) .await?;
.await? let block_stream = block_stream
.scan(*block_offset, move |chunk_offset, chunk| { .scan(*block_offset, move |chunk_offset, chunk| {
let r = match chunk { let r = match chunk {
Ok(chunk_bytes) => { Ok(chunk_bytes) => {
@ -588,19 +663,30 @@ fn body_from_blocks_range(
} }
fn response_body_from_block_stream(rx: mpsc::Receiver<ByteStream>) -> ResBody { fn response_body_from_block_stream(rx: mpsc::Receiver<ByteStream>) -> ResBody {
let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx) let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten();
.flatten() response_body_from_stream(body_stream)
.map(|x| { }
x.map(hyper::body::Frame::data)
.map_err(|e| Error::from(garage_util::error::Error::from(e))) fn response_body_from_stream<S>(stream: S) -> ResBody
}); where
S: Stream<Item = Result<Bytes, std::io::Error>> + Send + Sync + 'static,
{
let body_stream = stream.map(|x| {
x.map(hyper::body::Frame::data)
.map_err(|e| Error::from(garage_util::error::Error::from(e)))
});
ResBody::new(http_body_util::StreamBody::new(body_stream)) ResBody::new(http_body_util::StreamBody::new(body_stream))
} }
fn error_stream_item<E: std::fmt::Display>(e: E) -> ByteStream { fn error_stream_item<E: std::fmt::Display>(e: E) -> ByteStream {
let err = std::io::Error::new( Box::pin(stream::once(future::ready(Err(std_error_from_read_error(
std::io::ErrorKind::Other, e,
format!("Error while getting object data: {}", e), )))))
); }
Box::pin(stream::once(future::ready(Err(err))))
fn std_error_from_read_error<E: std::fmt::Display>(e: E) -> std::io::Error {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Error while reading object data: {}", e),
)
} }

View file

@ -944,9 +944,8 @@ mod tests {
timestamp: TS, timestamp: TS,
state: ObjectVersionState::Uploading { state: ObjectVersionState::Uploading {
multipart: true, multipart: true,
headers: ObjectVersionHeaders { encryption: ObjectVersionEncryption::Plaintext {
content_type: "text/plain".to_string(), headers: ObjectVersionHeaders(vec![]),
other: BTreeMap::<String, String>::new(),
}, },
}, },
} }

View file

@ -13,5 +13,6 @@ mod post_object;
mod put; mod put;
mod website; mod website;
mod encryption;
mod router; mod router;
pub mod xml; pub mod xml;

View file

@ -16,6 +16,7 @@ use garage_model::s3::version_table::*;
use crate::helpers::*; use crate::helpers::*;
use crate::s3::api_server::{ReqBody, ResBody}; use crate::s3::api_server::{ReqBody, ResBody};
use crate::s3::encryption::EncryptionParams;
use crate::s3::error::*; use crate::s3::error::*;
use crate::s3::put::*; use crate::s3::put::*;
use crate::s3::xml as s3_xml; use crate::s3::xml as s3_xml;
@ -41,13 +42,17 @@ pub async fn handle_create_multipart_upload(
let headers = get_headers(req.headers())?; let headers = get_headers(req.headers())?;
// Determine whether object should be encrypted, and if so the key
let encryption = EncryptionParams::new_from_headers(&garage, req.headers())?;
let object_encryption = encryption.encrypt_headers(headers)?;
// Create object in object table // Create object in object table
let object_version = ObjectVersion { let object_version = ObjectVersion {
uuid: upload_id, uuid: upload_id,
timestamp, timestamp,
state: ObjectVersionState::Uploading { state: ObjectVersionState::Uploading {
multipart: true, multipart: true,
headers, encryption: object_encryption,
}, },
}; };
let object = Object::new(*bucket_id, key.to_string(), vec![object_version]); let object = Object::new(*bucket_id, key.to_string(), vec![object_version]);
@ -68,7 +73,9 @@ pub async fn handle_create_multipart_upload(
}; };
let xml = s3_xml::to_xml_with_header(&result)?; let xml = s3_xml::to_xml_with_header(&result)?;
Ok(Response::new(string_body(xml))) let mut resp = Response::builder();
encryption.add_response_headers(&mut resp);
Ok(resp.body(string_body(xml))?)
} }
pub async fn handle_put_part( pub async fn handle_put_part(
@ -91,12 +98,21 @@ pub async fn handle_put_part(
// Read first chunk, and at the same time try to get object to see if it exists // Read first chunk, and at the same time try to get object to see if it exists
let key = key.to_string(); let key = key.to_string();
let stream = body_stream(req.into_body()); let (req_head, req_body) = req.into_parts();
let stream = body_stream(req_body);
let mut chunker = StreamChunker::new(stream, garage.config.block_size); let mut chunker = StreamChunker::new(stream, garage.config.block_size);
let ((_, _, mut mpu), first_block) = let ((_, object_version, mut mpu), first_block) =
futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?; futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?;
// Check encryption params
let object_encryption = match object_version.state {
ObjectVersionState::Uploading { encryption, .. } => encryption,
_ => unreachable!(),
};
let (encryption, _) =
EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?;
// Check object is valid and part can be accepted // Check object is valid and part can be accepted
let first_block = first_block.ok_or_bad_request("Empty body")?; let first_block = first_block.ok_or_bad_request("Empty body")?;
@ -136,24 +152,32 @@ pub async fn handle_put_part(
garage.version_table.insert(&version).await?; garage.version_table.insert(&version).await?;
// Copy data to version // Copy data to version
let (total_size, data_md5sum, data_sha256sum, _) = let (total_size, data_md5sum, data_sha256sum, _) = read_and_put_blocks(
read_and_put_blocks(&ctx, &version, part_number, first_block, &mut chunker).await?; &ctx,
&version,
encryption,
part_number,
first_block,
&mut chunker,
)
.await?;
// Verify that checksums match // Verify that checksums match
ensure_checksum_matches( ensure_checksum_matches(
data_md5sum.as_slice(), &data_md5sum,
data_sha256sum, data_sha256sum,
content_md5.as_deref(), content_md5.as_deref(),
content_sha256, content_sha256,
)?; )?;
// Store part etag in version // Store part etag in version
let data_md5sum_hex = hex::encode(data_md5sum); let etag = encryption.etag_from_md5(&data_md5sum);
mpu.parts.put( mpu.parts.put(
mpu_part_key, mpu_part_key,
MpuPart { MpuPart {
version: version_uuid, version: version_uuid,
etag: Some(data_md5sum_hex.clone()), etag: Some(etag.clone()),
size: Some(total_size), size: Some(total_size),
}, },
); );
@ -163,11 +187,9 @@ pub async fn handle_put_part(
// We won't have to clean up on drop. // We won't have to clean up on drop.
interrupted_cleanup.cancel(); interrupted_cleanup.cancel();
let response = Response::builder() let mut resp = Response::builder().header("ETag", format!("\"{}\"", etag));
.header("ETag", format!("\"{}\"", data_md5sum_hex)) encryption.add_response_headers(&mut resp);
.body(empty_body()) Ok(resp.body(empty_body())?)
.unwrap();
Ok(response)
} }
struct InterruptedCleanup(Option<InterruptedCleanupInner>); struct InterruptedCleanup(Option<InterruptedCleanupInner>);
@ -241,8 +263,8 @@ pub async fn handle_complete_multipart_upload(
return Err(Error::bad_request("No data was uploaded")); return Err(Error::bad_request("No data was uploaded"));
} }
let headers = match object_version.state { let object_encryption = match object_version.state {
ObjectVersionState::Uploading { headers, .. } => headers, ObjectVersionState::Uploading { encryption, .. } => encryption,
_ => unreachable!(), _ => unreachable!(),
}; };
@ -344,7 +366,7 @@ pub async fn handle_complete_multipart_upload(
// Write final object version // Write final object version
object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
ObjectVersionMeta { ObjectVersionMeta {
headers, encryption: object_encryption,
size: total_size, size: total_size,
etag: etag.clone(), etag: etag.clone(),
}, },

View file

@ -18,6 +18,7 @@ use garage_model::garage::Garage;
use crate::helpers::*; use crate::helpers::*;
use crate::s3::api_server::ResBody; use crate::s3::api_server::ResBody;
use crate::s3::cors::*; use crate::s3::cors::*;
use crate::s3::encryption::EncryptionParams;
use crate::s3::error::*; use crate::s3::error::*;
use crate::s3::put::{get_headers, save_stream}; use crate::s3::put::{get_headers, save_stream};
use crate::s3::xml as s3_xml; use crate::s3::xml as s3_xml;
@ -48,13 +49,17 @@ pub async fn handle_post_object(
let mut multipart = Multipart::with_constraints(stream, boundary, constraints); let mut multipart = Multipart::with_constraints(stream, boundary, constraints);
let mut params = HeaderMap::new(); let mut params = HeaderMap::new();
let field = loop { let file_field = loop {
let field = if let Some(field) = multipart.next_field().await? { let field = if let Some(field) = multipart.next_field().await? {
field field
} else { } else {
return Err(Error::bad_request("Request did not contain a file")); return Err(Error::bad_request("Request did not contain a file"));
}; };
let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) { let name: HeaderName = if let Some(Ok(name)) = field
.name()
.map(str::to_ascii_lowercase)
.map(TryInto::try_into)
{
name name
} else { } else {
continue; continue;
@ -93,10 +98,14 @@ pub async fn handle_post_object(
.ok_or_bad_request("No policy was provided")? .ok_or_bad_request("No policy was provided")?
.to_str()?; .to_str()?;
let authorization = Authorization::parse_form(&params)?; let authorization = Authorization::parse_form(&params)?;
let content_md5 = params
.get("content-md5")
.map(HeaderValue::to_str)
.transpose()?;
let key = if key.contains("${filename}") { let key = if key.contains("${filename}") {
// if no filename is provided, don't replace. This matches the behavior of AWS. // if no filename is provided, don't replace. This matches the behavior of AWS.
if let Some(filename) = field.file_name() { if let Some(filename) = file_field.file_name() {
key.replace("${filename}", filename) key.replace("${filename}", filename)
} else { } else {
key.to_owned() key.to_owned()
@ -143,9 +152,8 @@ pub async fn handle_post_object(
let mut conditions = decoded_policy.into_conditions()?; let mut conditions = decoded_policy.into_conditions()?;
for (param_key, value) in params.iter() { for (param_key, value) in params.iter() {
let mut param_key = param_key.to_string(); let param_key = param_key.as_str();
param_key.make_ascii_lowercase(); match param_key {
match param_key.as_str() {
"policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields "policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields
"content-type" => { "content-type" => {
let conds = conditions.params.remove("content-type").ok_or_else(|| { let conds = conditions.params.remove("content-type").ok_or_else(|| {
@ -190,7 +198,7 @@ pub async fn handle_post_object(
// how aws seems to behave. // how aws seems to behave.
continue; continue;
} }
let conds = conditions.params.remove(&param_key).ok_or_else(|| { let conds = conditions.params.remove(param_key).ok_or_else(|| {
Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
})?; })?;
for cond in conds { for cond in conds {
@ -218,8 +226,9 @@ pub async fn handle_post_object(
let headers = get_headers(&params)?; let headers = get_headers(&params)?;
let stream = field.map(|r| r.map_err(Into::into)); let encryption = EncryptionParams::new_from_headers(&garage, &params)?;
let stream = file_field.map(|r| r.map_err(Into::into));
let ctx = ReqCtx { let ctx = ReqCtx {
garage, garage,
bucket_id, bucket_id,
@ -228,17 +237,18 @@ pub async fn handle_post_object(
api_key, api_key,
}; };
let (_, md5) = save_stream( let res = save_stream(
&ctx, &ctx,
headers, headers,
encryption,
StreamLimiter::new(stream, conditions.content_length), StreamLimiter::new(stream, conditions.content_length),
&key, &key,
None, content_md5.map(str::to_string),
None, None,
) )
.await?; .await?;
let etag = format!("\"{}\"", md5); let etag = format!("\"{}\"", res.etag);
let mut resp = if let Some(mut target) = params let mut resp = if let Some(mut target) = params
.get("success_action_redirect") .get("success_action_redirect")
@ -252,11 +262,12 @@ pub async fn handle_post_object(
.append_pair("key", &key) .append_pair("key", &key)
.append_pair("etag", &etag); .append_pair("etag", &etag);
let target = target.to_string(); let target = target.to_string();
Response::builder() let mut resp = Response::builder()
.status(StatusCode::SEE_OTHER) .status(StatusCode::SEE_OTHER)
.header(header::LOCATION, target.clone()) .header(header::LOCATION, target.clone())
.header(header::ETAG, etag) .header(header::ETAG, etag);
.body(string_body(target))? encryption.add_response_headers(&mut resp);
resp.body(string_body(target))?
} else { } else {
let path = head let path = head
.uri .uri
@ -283,9 +294,10 @@ pub async fn handle_post_object(
.get("success_action_status") .get("success_action_status")
.and_then(|h| h.to_str().ok()) .and_then(|h| h.to_str().ok())
.unwrap_or("204"); .unwrap_or("204");
let builder = Response::builder() let mut builder = Response::builder()
.header(header::LOCATION, location.clone()) .header(header::LOCATION, location.clone())
.header(header::ETAG, etag.clone()); .header(header::ETAG, etag.clone());
encryption.add_response_headers(&mut builder);
match action { match action {
"200" => builder.status(StatusCode::OK).body(empty_body())?, "200" => builder.status(StatusCode::OK).body(empty_body())?,
"201" => { "201" => {

View file

@ -1,4 +1,4 @@
use std::collections::{BTreeMap, HashMap}; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use base64::prelude::*; use base64::prelude::*;
@ -36,10 +36,18 @@ use garage_model::s3::version_table::*;
use crate::helpers::*; use crate::helpers::*;
use crate::s3::api_server::{ReqBody, ResBody}; use crate::s3::api_server::{ReqBody, ResBody};
use crate::s3::encryption::EncryptionParams;
use crate::s3::error::*; use crate::s3::error::*;
const PUT_BLOCKS_MAX_PARALLEL: usize = 3; const PUT_BLOCKS_MAX_PARALLEL: usize = 3;
/// Values returned by `save_stream` once an object version has been stored.
pub struct SaveStreamResult {
/// UUID generated for the newly written object version
pub version_uuid: Uuid,
/// Timestamp assigned to the newly written object version
pub version_timestamp: u64,
/// Etag WITHOUT THE QUOTES (just the hex value)
pub etag: String,
}
pub async fn handle_put( pub async fn handle_put(
ctx: ReqCtx, ctx: ReqCtx,
req: Request<ReqBody>, req: Request<ReqBody>,
@ -50,6 +58,9 @@ pub async fn handle_put(
let headers = get_headers(req.headers())?; let headers = get_headers(req.headers())?;
debug!("Object headers: {:?}", headers); debug!("Object headers: {:?}", headers);
// Determine whether object should be encrypted, and if so the key
let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?;
let content_md5 = match req.headers().get("content-md5") { let content_md5 = match req.headers().get("content-md5") {
Some(x) => Some(x.to_str()?.to_string()), Some(x) => Some(x.to_str()?.to_string()),
None => None, None => None,
@ -57,19 +68,33 @@ pub async fn handle_put(
let stream = body_stream(req.into_body()); let stream = body_stream(req.into_body());
save_stream(&ctx, headers, stream, key, content_md5, content_sha256) let res = save_stream(
.await &ctx,
.map(|(uuid, md5)| put_response(uuid, md5)) headers,
encryption,
stream,
key,
content_md5,
content_sha256,
)
.await?;
let mut resp = Response::builder()
.header("x-amz-version-id", hex::encode(res.version_uuid))
.header("ETag", format!("\"{}\"", res.etag));
encryption.add_response_headers(&mut resp);
Ok(resp.body(empty_body())?)
} }
pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
ctx: &ReqCtx, ctx: &ReqCtx,
headers: ObjectVersionHeaders, headers: ObjectVersionHeaders,
encryption: EncryptionParams,
body: S, body: S,
key: &String, key: &String,
content_md5: Option<String>, content_md5: Option<String>,
content_sha256: Option<FixedBytes32>, content_sha256: Option<FixedBytes32>,
) -> Result<(Uuid, String), Error> { ) -> Result<SaveStreamResult, Error> {
let ReqCtx { let ReqCtx {
garage, bucket_id, .. garage, bucket_id, ..
} = ctx; } = ctx;
@ -82,6 +107,8 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
let first_block = first_block_opt.unwrap_or_default(); let first_block = first_block_opt.unwrap_or_default();
let object_encryption = encryption.encrypt_headers(headers)?;
// Generate identity of new version // Generate identity of new version
let version_uuid = gen_uuid(); let version_uuid = gen_uuid();
let version_timestamp = next_timestamp(existing_object.as_ref()); let version_timestamp = next_timestamp(existing_object.as_ref());
@ -92,37 +119,43 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
let mut md5sum = Md5::new(); let mut md5sum = Md5::new();
md5sum.update(&first_block[..]); md5sum.update(&first_block[..]);
let data_md5sum = md5sum.finalize(); let data_md5sum = md5sum.finalize();
let data_md5sum_hex = hex::encode(data_md5sum);
let data_sha256sum = sha256sum(&first_block[..]); let data_sha256sum = sha256sum(&first_block[..]);
let size = first_block.len() as u64;
ensure_checksum_matches( ensure_checksum_matches(
data_md5sum.as_slice(), &data_md5sum,
data_sha256sum, data_sha256sum,
content_md5.as_deref(), content_md5.as_deref(),
content_sha256, content_sha256,
)?; )?;
let size = first_block.len() as u64;
check_quotas(ctx, size, existing_object.as_ref()).await?; check_quotas(ctx, size, existing_object.as_ref()).await?;
let etag = encryption.etag_from_md5(&data_md5sum);
let inline_data = encryption.encrypt_blob(&first_block)?.to_vec();
let object_version = ObjectVersion { let object_version = ObjectVersion {
uuid: version_uuid, uuid: version_uuid,
timestamp: version_timestamp, timestamp: version_timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::Inline( state: ObjectVersionState::Complete(ObjectVersionData::Inline(
ObjectVersionMeta { ObjectVersionMeta {
headers, encryption: object_encryption,
size, size,
etag: data_md5sum_hex.clone(), etag: etag.clone(),
}, },
first_block.to_vec(), inline_data,
)), )),
}; };
let object = Object::new(*bucket_id, key.into(), vec![object_version]); let object = Object::new(*bucket_id, key.into(), vec![object_version]);
garage.object_table.insert(&object).await?; garage.object_table.insert(&object).await?;
return Ok((version_uuid, data_md5sum_hex)); return Ok(SaveStreamResult {
version_uuid,
version_timestamp,
etag,
});
} }
// The following consists in many steps that can each fail. // The following consists in many steps that can each fail.
@ -142,7 +175,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
uuid: version_uuid, uuid: version_uuid,
timestamp: version_timestamp, timestamp: version_timestamp,
state: ObjectVersionState::Uploading { state: ObjectVersionState::Uploading {
headers: headers.clone(), encryption: object_encryption.clone(),
multipart: false, multipart: false,
}, },
}; };
@ -165,10 +198,10 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
// Transfer data and verify checksum // Transfer data and verify checksum
let (total_size, data_md5sum, data_sha256sum, first_block_hash) = let (total_size, data_md5sum, data_sha256sum, first_block_hash) =
read_and_put_blocks(ctx, &version, 1, first_block, &mut chunker).await?; read_and_put_blocks(ctx, &version, encryption, 1, first_block, &mut chunker).await?;
ensure_checksum_matches( ensure_checksum_matches(
data_md5sum.as_slice(), &data_md5sum,
data_sha256sum, data_sha256sum,
content_md5.as_deref(), content_md5.as_deref(),
content_sha256, content_sha256,
@ -177,12 +210,13 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
check_quotas(ctx, total_size, existing_object.as_ref()).await?; check_quotas(ctx, total_size, existing_object.as_ref()).await?;
// Save final object state, marked as Complete // Save final object state, marked as Complete
let md5sum_hex = hex::encode(data_md5sum); let etag = encryption.etag_from_md5(&data_md5sum);
object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
ObjectVersionMeta { ObjectVersionMeta {
headers, encryption: object_encryption,
size: total_size, size: total_size,
etag: md5sum_hex.clone(), etag: etag.clone(),
}, },
first_block_hash, first_block_hash,
)); ));
@ -193,7 +227,11 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
// We won't have to clean up on drop. // We won't have to clean up on drop.
interrupted_cleanup.cancel(); interrupted_cleanup.cancel();
Ok((version_uuid, md5sum_hex)) Ok(SaveStreamResult {
version_uuid,
version_timestamp,
etag,
})
} }
/// Validate MD5 sum against content-md5 header /// Validate MD5 sum against content-md5 header
@ -248,7 +286,7 @@ pub(crate) async fn check_quotas(
.await?; .await?;
let counters = counters let counters = counters
.map(|x| x.filtered_values(&garage.system.ring.borrow())) .map(|x| x.filtered_values(&garage.system.cluster_layout()))
.unwrap_or_default(); .unwrap_or_default();
let (prev_cnt_obj, prev_cnt_size) = match prev_object { let (prev_cnt_obj, prev_cnt_size) = match prev_object {
@ -290,6 +328,7 @@ pub(crate) async fn check_quotas(
pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>( pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
ctx: &ReqCtx, ctx: &ReqCtx,
version: &Version, version: &Version,
encryption: EncryptionParams,
part_number: u64, part_number: u64,
first_block: Bytes, first_block: Bytes,
chunker: &mut StreamChunker<S>, chunker: &mut StreamChunker<S>,
@ -349,12 +388,31 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
)) ))
}; };
let (block_tx3, mut block_rx3) = mpsc::channel::<Result<(Bytes, Hash), Error>>(1); let (block_tx3, mut block_rx3) = mpsc::channel::<Result<(Bytes, u64, Hash), Error>>(1);
let hash_blocks = async { let encrypt_hash_blocks = async {
let mut first_block_hash = None; let mut first_block_hash = None;
while let Some(next) = block_rx2.recv().await { while let Some(next) = block_rx2.recv().await {
match next { match next {
Ok(block) => { Ok(block) => {
let unencrypted_len = block.len() as u64;
let block = if encryption.is_encrypted() {
let res =
tokio::task::spawn_blocking(move || encryption.encrypt_block(block))
.with_context(Context::current_with_span(
tracer.start("Encrypt block"),
))
.await
.unwrap();
match res {
Ok(b) => b,
Err(e) => {
block_tx3.send(Err(e)).await?;
break;
}
}
} else {
block
};
let hash = async_blake2sum(block.clone()) let hash = async_blake2sum(block.clone())
.with_context(Context::current_with_span( .with_context(Context::current_with_span(
tracer.start("Hash block (blake2)"), tracer.start("Hash block (blake2)"),
@ -363,7 +421,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
if first_block_hash.is_none() { if first_block_hash.is_none() {
first_block_hash = Some(hash); first_block_hash = Some(hash);
} }
block_tx3.send(Ok((block, hash))).await?; block_tx3.send(Ok((block, unencrypted_len, hash))).await?;
} }
Err(e) => { Err(e) => {
block_tx3.send(Err(e)).await?; block_tx3.send(Err(e)).await?;
@ -398,7 +456,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
block_rx3.recv().await block_rx3.recv().await
} }
}; };
let (block, hash) = tokio::select! { let (block, unencrypted_len, hash) = tokio::select! {
result = write_futs_next => { result = write_futs_next => {
result?; result?;
continue; continue;
@ -410,17 +468,18 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
}; };
// For next block to be written: count its size and spawn future to write it // For next block to be written: count its size and spawn future to write it
let offset = written_bytes;
written_bytes += block.len() as u64;
write_futs.push_back(put_block_and_meta( write_futs.push_back(put_block_and_meta(
ctx, ctx,
version, version,
part_number, part_number,
offset, written_bytes,
hash, hash,
block, block,
unencrypted_len,
encryption.is_encrypted(),
order_stream.order(written_bytes), order_stream.order(written_bytes),
)); ));
written_bytes += unencrypted_len;
} }
while let Some(res) = write_futs.next().await { while let Some(res) = write_futs.next().await {
res?; res?;
@ -429,7 +488,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
}; };
let (_, stream_hash_result, block_hash_result, final_result) = let (_, stream_hash_result, block_hash_result, final_result) =
futures::join!(read_blocks, hash_stream, hash_blocks, put_blocks); futures::join!(read_blocks, hash_stream, encrypt_hash_blocks, put_blocks);
let total_size = final_result?; let total_size = final_result?;
// unwrap here is ok, because if hasher failed, it is because something failed // unwrap here is ok, because if hasher failed, it is because something failed
@ -449,6 +508,8 @@ async fn put_block_and_meta(
offset: u64, offset: u64,
hash: Hash, hash: Hash,
block: Bytes, block: Bytes,
size: u64,
is_encrypted: bool,
order_tag: OrderTag, order_tag: OrderTag,
) -> Result<(), GarageError> { ) -> Result<(), GarageError> {
let ReqCtx { garage, .. } = ctx; let ReqCtx { garage, .. } = ctx;
@ -459,10 +520,7 @@ async fn put_block_and_meta(
part_number, part_number,
offset, offset,
}, },
VersionBlock { VersionBlock { hash, size },
hash,
size: block.len() as u64,
},
); );
let block_ref = BlockRef { let block_ref = BlockRef {
@ -474,7 +532,7 @@ async fn put_block_and_meta(
futures::try_join!( futures::try_join!(
garage garage
.block_manager .block_manager
.rpc_put_block(hash, block, Some(order_tag)), .rpc_put_block(hash, block, is_encrypted, Some(order_tag)),
garage.version_table.insert(&version), garage.version_table.insert(&version),
garage.block_ref_table.insert(&block_ref), garage.block_ref_table.insert(&block_ref),
)?; )?;
@ -517,14 +575,6 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
} }
} }
/// Build the response sent after a successful object upload.
///
/// The response has an empty body and carries two headers:
/// `x-amz-version-id` (the hex-encoded version UUID) and `ETag`
/// (the MD5 hex digest wrapped in double quotes).
///
/// # Panics
///
/// Panics if the response builder rejects the headers or the body.
pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response<ResBody> {
    let version_id = hex::encode(version_uuid);
    let quoted_etag = format!("\"{}\"", md5sum_hex);

    Response::builder()
        .header("x-amz-version-id", version_id)
        .header("ETag", quoted_etag)
        .body(empty_body())
        .unwrap()
}
struct InterruptedCleanup(Option<InterruptedCleanupInner>); struct InterruptedCleanup(Option<InterruptedCleanupInner>);
struct InterruptedCleanupInner { struct InterruptedCleanupInner {
garage: Arc<Garage>, garage: Arc<Garage>,
@ -559,57 +609,35 @@ impl Drop for InterruptedCleanup {
// ============ helpers ============ // ============ helpers ============
/// Read the `Content-Type` request header as an owned string.
///
/// Falls back to `"blob"` when the header is absent. Returns an error when
/// the header is present but `to_str()` fails on its value.
pub(crate) fn get_mime_type(headers: &HeaderMap<HeaderValue>) -> Result<String, Error> {
    let mime = match headers.get(hyper::header::CONTENT_TYPE) {
        Some(value) => value.to_str()?,
        None => "blob",
    };
    Ok(mime.to_string())
}
pub(crate) fn get_headers(headers: &HeaderMap<HeaderValue>) -> Result<ObjectVersionHeaders, Error> { pub(crate) fn get_headers(headers: &HeaderMap<HeaderValue>) -> Result<ObjectVersionHeaders, Error> {
let content_type = get_mime_type(headers)?; let mut ret = Vec::new();
let mut other = BTreeMap::new();
// Preserve standard headers // Preserve standard headers
let standard_header = vec![ let standard_header = vec![
hyper::header::CONTENT_TYPE,
hyper::header::CACHE_CONTROL, hyper::header::CACHE_CONTROL,
hyper::header::CONTENT_DISPOSITION, hyper::header::CONTENT_DISPOSITION,
hyper::header::CONTENT_ENCODING, hyper::header::CONTENT_ENCODING,
hyper::header::CONTENT_LANGUAGE, hyper::header::CONTENT_LANGUAGE,
hyper::header::EXPIRES, hyper::header::EXPIRES,
]; ];
for h in standard_header.iter() { for name in standard_header.iter() {
if let Some(v) = headers.get(h) { if let Some(value) = headers.get(name) {
match v.to_str() { ret.push((name.to_string(), value.to_str()?.to_string()));
Ok(v_str) => {
other.insert(h.to_string(), v_str.to_string());
}
Err(e) => {
warn!("Discarding header {}, error in .to_str(): {}", h, e);
}
}
} }
} }
// Preserve x-amz-meta- headers // Preserve x-amz-meta- headers
for (k, v) in headers.iter() { for (name, value) in headers.iter() {
if k.as_str().starts_with("x-amz-meta-") { if name.as_str().starts_with("x-amz-meta-") {
match std::str::from_utf8(v.as_bytes()) { ret.push((
Ok(v_str) => { name.to_string(),
other.insert(k.to_string(), v_str.to_string()); std::str::from_utf8(value.as_bytes())?.to_string(),
} ));
Err(e) => {
warn!("Discarding header {}, error in .to_str(): {}", k, e);
}
}
} }
} }
Ok(ObjectVersionHeaders { Ok(ObjectVersionHeaders(ret))
content_type,
other,
})
} }
pub(crate) fn next_timestamp(existing_object: Option<&Object>) -> u64 { pub(crate) fn next_timestamp(existing_object: Option<&Object>) -> u64 {

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_block" name = "garage_block"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"

View file

@ -96,7 +96,7 @@ impl DataBlock {
} }
} }
fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> { pub fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> {
let mut result = Vec::<u8>::new(); let mut result = Vec::<u8>::new();
let mut encoder = Encoder::new(&mut result, level)?; let mut encoder = Encoder::new(&mut result, level)?;
encoder.include_checksum(true)?; encoder.include_checksum(true)?;

View file

@ -9,3 +9,5 @@ mod block;
mod layout; mod layout;
mod metrics; mod metrics;
mod rc; mod rc;
pub use block::zstd_encode;

View file

@ -267,8 +267,10 @@ impl BlockManager {
F: Fn(DataBlockStream) -> Fut, F: Fn(DataBlockStream) -> Fut,
Fut: futures::Future<Output = Result<T, Error>>, Fut: futures::Future<Output = Result<T, Error>>,
{ {
let who = self.replication.read_nodes(hash); let who = self
let who = self.system.rpc.request_order(&who); .system
.rpc_helper()
.block_read_nodes_of(hash, self.system.rpc_helper());
for node in who.iter() { for node in who.iter() {
let node_id = NodeID::from(*node); let node_id = NodeID::from(*node);
@ -308,7 +310,7 @@ impl BlockManager {
// if the first one doesn't succeed rapidly // if the first one doesn't succeed rapidly
// TODO: keep first request running when initiating a new one and take the // TODO: keep first request running when initiating a new one and take the
// one that finishes earlier // one that finishes earlier
_ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => { _ = tokio::time::sleep(self.system.rpc_helper().rpc_timeout()) => {
debug!("Get block {:?}: node {:?} didn't return block in time, trying next.", hash, node); debug!("Get block {:?}: node {:?} didn't return block in time, trying next.", hash, node);
} }
}; };
@ -341,26 +343,18 @@ impl BlockManager {
} }
} }
/// Ask nodes that might have a block for it, and return it as one big `Bytes`.
///
/// Opens a streaming fetch through `rpc_get_block_streaming`, reads that
/// stream to its end, and converts the result into a single buffer.
/// Errors from either step are propagated.
pub async fn rpc_get_block(
    &self,
    hash: &Hash,
    order_tag: Option<OrderTag>,
) -> Result<Bytes, Error> {
    let block_stream = self.rpc_get_block_streaming(hash, order_tag).await?;
    let whole_block = read_stream_to_end(block_stream).await?;
    Ok(whole_block.into_bytes())
}
/// Send block to nodes that should have it /// Send block to nodes that should have it
pub async fn rpc_put_block( pub async fn rpc_put_block(
&self, &self,
hash: Hash, hash: Hash,
data: Bytes, data: Bytes,
prevent_compression: bool,
order_tag: Option<OrderTag>, order_tag: Option<OrderTag>,
) -> Result<(), Error> { ) -> Result<(), Error> {
let who = self.replication.write_nodes(&hash); let who = self.replication.write_sets(&hash);
let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) let compression_level = self.compression_level.filter(|_| !prevent_compression);
let (header, bytes) = DataBlock::from_buffer(data, compression_level)
.await .await
.into_parts(); .into_parts();
let put_block_rpc = let put_block_rpc =
@ -372,10 +366,10 @@ impl BlockManager {
}; };
self.system self.system
.rpc .rpc_helper()
.try_call_many( .try_write_many_sets(
&self.endpoint, &self.endpoint,
&who[..], who.as_ref(),
put_block_rpc, put_block_rpc,
RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY)
.with_quorum(self.replication.write_quorum()), .with_quorum(self.replication.write_quorum()),
@ -390,11 +384,6 @@ impl BlockManager {
Ok(self.rc.rc.len()?) Ok(self.rc.rc.len()?)
} }
/// Get the number of items in the refcount table via the tree's `fast_len()`.
///
/// The result is `None` when `fast_len()` reports no count.
pub fn rc_fast_len(&self) -> Result<Option<usize>, Error> {
    let count = self.rc.rc.fast_len()?;
    Ok(count)
}
/// Send command to start/stop/manager scrub worker /// Send command to start/stop/manager scrub worker
pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) -> Result<(), Error> { pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) -> Result<(), Error> {
let tx = self.tx_scrub_command.load(); let tx = self.tx_scrub_command.load();
@ -410,7 +399,7 @@ impl BlockManager {
/// List all resync errors /// List all resync errors
pub fn list_resync_errors(&self) -> Result<Vec<BlockResyncErrorInfo>, Error> { pub fn list_resync_errors(&self) -> Result<Vec<BlockResyncErrorInfo>, Error> {
let mut blocks = Vec::with_capacity(self.resync.errors.len()); let mut blocks = Vec::with_capacity(self.resync.errors.len()?);
for ent in self.resync.errors.iter()? { for ent in self.resync.errors.iter()? {
let (hash, cnt) = ent?; let (hash, cnt) = ent?;
let cnt = ErrorCounter::decode(&cnt); let cnt = ErrorCounter::decode(&cnt);
@ -448,7 +437,7 @@ impl BlockManager {
tokio::spawn(async move { tokio::spawn(async move {
if let Err(e) = this if let Err(e) = this
.resync .resync
.put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout()) .put_to_resync(&hash, 2 * this.system.rpc_helper().rpc_timeout())
{ {
error!("Block {:?} could not be put in resync queue: {}.", hash, e); error!("Block {:?} could not be put in resync queue: {}.", hash, e);
} }
@ -542,7 +531,7 @@ impl BlockManager {
None => { None => {
// Not found but maybe we should have had it ?? // Not found but maybe we should have had it ??
self.resync self.resync
.put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?; .put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?;
return Err(Error::Message(format!( return Err(Error::Message(format!(
"block {:?} not found on node", "block {:?} not found on node",
hash hash

View file

@ -1,7 +1,6 @@
use opentelemetry::{global, metrics::*}; use opentelemetry::{global, metrics::*};
use garage_db as db; use garage_db as db;
use garage_db::counted_tree_hack::CountedTree;
/// TableMetrics reference all counter used for metrics /// TableMetrics reference all counter used for metrics
pub struct BlockManagerMetrics { pub struct BlockManagerMetrics {
@ -29,8 +28,8 @@ impl BlockManagerMetrics {
pub fn new( pub fn new(
compression_level: Option<i32>, compression_level: Option<i32>,
rc_tree: db::Tree, rc_tree: db::Tree,
resync_queue: CountedTree, resync_queue: db::Tree,
resync_errors: CountedTree, resync_errors: db::Tree,
) -> Self { ) -> Self {
let meter = global::meter("garage_model/block"); let meter = global::meter("garage_model/block");
Self { Self {
@ -45,15 +44,17 @@ impl BlockManagerMetrics {
.init(), .init(),
_rc_size: meter _rc_size: meter
.u64_value_observer("block.rc_size", move |observer| { .u64_value_observer("block.rc_size", move |observer| {
if let Ok(Some(v)) = rc_tree.fast_len() { if let Ok(value) = rc_tree.len() {
observer.observe(v as u64, &[]) observer.observe(value as u64, &[])
} }
}) })
.with_description("Number of blocks known to the reference counter") .with_description("Number of blocks known to the reference counter")
.init(), .init(),
_resync_queue_len: meter _resync_queue_len: meter
.u64_value_observer("block.resync_queue_length", move |observer| { .u64_value_observer("block.resync_queue_length", move |observer| {
observer.observe(resync_queue.len() as u64, &[]) if let Ok(value) = resync_queue.len() {
observer.observe(value as u64, &[]);
}
}) })
.with_description( .with_description(
"Number of block hashes queued for local check and possible resync", "Number of block hashes queued for local check and possible resync",
@ -61,7 +62,9 @@ impl BlockManagerMetrics {
.init(), .init(),
_resync_errored_blocks: meter _resync_errored_blocks: meter
.u64_value_observer("block.resync_errored_blocks", move |observer| { .u64_value_observer("block.resync_errored_blocks", move |observer| {
observer.observe(resync_errors.len() as u64, &[]) if let Ok(value) = resync_errors.len() {
observer.observe(value as u64, &[]);
}
}) })
.with_description("Number of block hashes whose last resync resulted in an error") .with_description("Number of block hashes whose last resync resulted in an error")
.init(), .init(),

View file

@ -15,7 +15,6 @@ use opentelemetry::{
}; };
use garage_db as db; use garage_db as db;
use garage_db::counted_tree_hack::CountedTree;
use garage_util::background::*; use garage_util::background::*;
use garage_util::data::*; use garage_util::data::*;
@ -47,9 +46,9 @@ pub(crate) const MAX_RESYNC_WORKERS: usize = 8;
const INITIAL_RESYNC_TRANQUILITY: u32 = 2; const INITIAL_RESYNC_TRANQUILITY: u32 = 2;
pub struct BlockResyncManager { pub struct BlockResyncManager {
pub(crate) queue: CountedTree, pub(crate) queue: db::Tree,
pub(crate) notify: Arc<Notify>, pub(crate) notify: Arc<Notify>,
pub(crate) errors: CountedTree, pub(crate) errors: db::Tree,
busy_set: BusySet, busy_set: BusySet,
@ -90,12 +89,10 @@ impl BlockResyncManager {
let queue = db let queue = db
.open_tree("block_local_resync_queue") .open_tree("block_local_resync_queue")
.expect("Unable to open block_local_resync_queue tree"); .expect("Unable to open block_local_resync_queue tree");
let queue = CountedTree::new(queue).expect("Could not count block_local_resync_queue");
let errors = db let errors = db
.open_tree("block_local_resync_errors") .open_tree("block_local_resync_errors")
.expect("Unable to open block_local_resync_errors tree"); .expect("Unable to open block_local_resync_errors tree");
let errors = CountedTree::new(errors).expect("Could not count block_local_resync_errors");
let persister = PersisterShared::new(&system.metadata_dir, "resync_cfg"); let persister = PersisterShared::new(&system.metadata_dir, "resync_cfg");
@ -110,16 +107,12 @@ impl BlockResyncManager {
/// Get length of resync queue /// Get length of resync queue
pub fn queue_len(&self) -> Result<usize, Error> { pub fn queue_len(&self) -> Result<usize, Error> {
// This currently can't return an error because the CountedTree hack Ok(self.queue.len()?)
// doesn't error on .len(), but this will change when we remove the hack
// (hopefully someday!)
Ok(self.queue.len())
} }
/// Get number of blocks that have an error /// Get number of blocks that have an error
pub fn errors_len(&self) -> Result<usize, Error> { pub fn errors_len(&self) -> Result<usize, Error> {
// (see queue_len comment) Ok(self.errors.len()?)
Ok(self.errors.len())
} }
/// Clear the error counter for a block and put it in queue immediately /// Clear the error counter for a block and put it in queue immediately
@ -180,7 +173,7 @@ impl BlockResyncManager {
// deleted once the garbage collection delay has passed. // deleted once the garbage collection delay has passed.
// //
// Here are some explanations on how the resync queue works. // Here are some explanations on how the resync queue works.
// There are two Sled trees that are used to have information // There are two db trees that are used to have information
// about the status of blocks that need to be resynchronized: // about the status of blocks that need to be resynchronized:
// //
// - resync.queue: a tree that is ordered first by a timestamp // - resync.queue: a tree that is ordered first by a timestamp
@ -377,7 +370,7 @@ impl BlockResyncManager {
info!("Resync block {:?}: offloading and deleting", hash); info!("Resync block {:?}: offloading and deleting", hash);
let existing_path = existing_path.unwrap(); let existing_path = existing_path.unwrap();
let mut who = manager.replication.write_nodes(hash); let mut who = manager.replication.storage_nodes(hash);
if who.len() < manager.replication.write_quorum() { if who.len() < manager.replication.write_quorum() {
return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string())); return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
} }
@ -385,7 +378,7 @@ impl BlockResyncManager {
let who_needs_resps = manager let who_needs_resps = manager
.system .system
.rpc .rpc_helper()
.call_many( .call_many(
&manager.endpoint, &manager.endpoint,
&who, &who,
@ -431,10 +424,10 @@ impl BlockResyncManager {
.with_stream_from_buffer(bytes); .with_stream_from_buffer(bytes);
manager manager
.system .system
.rpc .rpc_helper()
.try_call_many( .try_call_many(
&manager.endpoint, &manager.endpoint,
&need_nodes[..], &need_nodes,
put_block_message, put_block_message,
RequestStrategy::with_priority(PRIO_BACKGROUND) RequestStrategy::with_priority(PRIO_BACKGROUND)
.with_quorum(need_nodes.len()), .with_quorum(need_nodes.len()),
@ -541,9 +534,9 @@ impl Worker for ResyncWorker {
Ok(WorkerState::Idle) Ok(WorkerState::Idle)
} }
Err(e) => { Err(e) => {
// The errors that we have here are only Sled errors // The errors that we have here are only db errors
// We don't really know how to handle them so just ¯\_(ツ)_/¯ // We don't really know how to handle them so just ¯\_(ツ)_/¯
// (there is kind of an assumption that Sled won't error on us, // (there is kind of an assumption that the db won't error on us,
// if it does there is not much we can do -- TODO should we just panic?) // if it does there is not much we can do -- TODO should we just panic?)
// Here we just give the error to the worker manager, // Here we just give the error to the worker manager,
// it will print it to the logs and increment a counter // it will print it to the logs and increment a counter

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_db" name = "garage_db"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -20,13 +20,12 @@ heed = { workspace = true, optional = true }
rusqlite = { workspace = true, optional = true, features = ["backup"] } rusqlite = { workspace = true, optional = true, features = ["backup"] }
r2d2 = { workspace = true, optional = true } r2d2 = { workspace = true, optional = true }
r2d2_sqlite = { workspace = true, optional = true } r2d2_sqlite = { workspace = true, optional = true }
sled = { workspace = true, optional = true }
[dev-dependencies] [dev-dependencies]
mktemp.workspace = true mktemp.workspace = true
[features] [features]
default = [ "sled", "lmdb", "sqlite" ] default = [ "lmdb", "sqlite" ]
bundled-libs = [ "rusqlite?/bundled" ] bundled-libs = [ "rusqlite?/bundled" ]
lmdb = [ "heed" ] lmdb = [ "heed" ]
sqlite = [ "rusqlite", "r2d2", "r2d2_sqlite" ] sqlite = [ "rusqlite", "r2d2", "r2d2_sqlite" ]

View file

@ -1,127 +0,0 @@
//! This hack allows a db tree to keep in RAM a counter of the number of entries
//! it contains, which is used to call .len() on it. This is useful only for
//! the sled backend where .len() otherwise would have to traverse the whole
//! tree to count items. For sqlite and lmdb, this is mostly useless (but
//! hopefully not harmful!). Note that a CountedTree cannot be part of a
//! transaction.
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
use crate::{Result, Tree, TxError, Value, ValueIter};
/// A handle to a db tree paired with an in-memory entry counter.
///
/// The state lives behind an `Arc`, so cloning this handle yields another
/// handle to the same tree and the same counter.
#[derive(Clone)]
pub struct CountedTree(Arc<CountedTreeInternal>);
/// Shared state behind a `CountedTree` handle.
struct CountedTreeInternal {
// The underlying db tree that all reads and writes are forwarded to.
tree: Tree,
// In-memory count of entries, adjusted after each successful write.
len: AtomicUsize,
}
impl CountedTree {
    /// Wrap `tree`, seeding the in-memory counter from one `tree.len()` call.
    pub fn new(tree: Tree) -> Result<Self> {
        let initial_len = tree.len()?;
        let inner = CountedTreeInternal {
            tree,
            len: AtomicUsize::new(initial_len),
        };
        Ok(Self(Arc::new(inner)))
    }

    /// Current value of the in-memory counter; the tree itself is not queried.
    pub fn len(&self) -> usize {
        self.0.len.load(Ordering::SeqCst)
    }

    /// Whether the in-memory counter is zero.
    pub fn is_empty(&self) -> bool {
        self.0.len.load(Ordering::SeqCst) == 0
    }

    /// Look up `key` in the underlying tree.
    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
        let inner = &self.0;
        inner.tree.get(key)
    }

    /// Forward to the underlying tree's `first()`.
    pub fn first(&self) -> Result<Option<(Value, Value)>> {
        let inner = &self.0;
        inner.tree.first()
    }

    /// Forward to the underlying tree's `iter()`.
    pub fn iter(&self) -> Result<ValueIter<'_>> {
        let inner = &self.0;
        inner.tree.iter()
    }

    // ---- writing functions ----

    /// Insert `value` under `key` and return the previous value, if any.
    ///
    /// The counter is incremented only when there was no previous value.
    pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<Value>>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let previous = self.0.tree.insert(key, value)?;
        match previous {
            // Overwrote an existing entry: the entry count is unchanged.
            Some(_) => {}
            None => {
                self.0.len.fetch_add(1, Ordering::SeqCst);
            }
        }
        Ok(previous)
    }

    /// Remove `key` and return the value it held, if any.
    ///
    /// The counter is decremented only when a value was actually removed.
    pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
        let removed = self.0.tree.remove(key)?;
        match removed {
            Some(_) => {
                self.0.len.fetch_sub(1, Ordering::SeqCst);
            }
            // Nothing was stored under this key: the entry count is unchanged.
            None => {}
        }
        Ok(removed)
    }

    /// Replace the value under `key` with `new`, but only if the stored value
    /// currently equals `expected_old` (`None` meaning "no entry").
    ///
    /// A `new` of `None` removes the entry. Returns `Ok(true)` when the swap
    /// was applied and `Ok(false)` when the stored value did not match.
    /// Database errors are returned as `Err`. The check and the write run
    /// inside one db transaction; the counter is adjusted afterwards, only on
    /// success.
    pub fn compare_and_swap<K, OV, NV>(
        &self,
        key: K,
        expected_old: Option<OV>,
        new: Option<NV>,
    ) -> Result<bool>
    where
        K: AsRef<[u8]>,
        OV: AsRef<[u8]>,
        NV: AsRef<[u8]>,
    {
        let had_entry = expected_old.is_some();
        let will_have_entry = new.is_some();

        let outcome = self.0.tree.db().transaction(|tx| {
            let current = tx.get(&self.0.tree, &key)?;
            let matches_expected = match (&current, &expected_old) {
                (None, None) => true,
                (Some(x), Some(y)) if x == y.as_ref() => true,
                _ => false,
            };
            // Guard: abort the transaction when the stored value differs.
            if !matches_expected {
                return Err(TxError::Abort(()));
            }
            match &new {
                Some(v) => {
                    tx.insert(&self.0.tree, &key, v)?;
                }
                None => {
                    tx.remove(&self.0.tree, &key)?;
                }
            }
            Ok(())
        });

        match outcome {
            Ok(()) => {
                // On success the old state is known to equal `expected_old`,
                // so the count changes only on absent->present or present->absent.
                if !had_entry && will_have_entry {
                    self.0.len.fetch_add(1, Ordering::SeqCst);
                } else if had_entry && !will_have_entry {
                    self.0.len.fetch_sub(1, Ordering::SeqCst);
                }
                Ok(true)
            }
            Err(TxError::Abort(())) => Ok(false),
            Err(TxError::Db(e)) => Err(e),
        }
    }
}

View file

@ -3,13 +3,9 @@ extern crate tracing;
#[cfg(feature = "lmdb")] #[cfg(feature = "lmdb")]
pub mod lmdb_adapter; pub mod lmdb_adapter;
#[cfg(feature = "sled")]
pub mod sled_adapter;
#[cfg(feature = "sqlite")] #[cfg(feature = "sqlite")]
pub mod sqlite_adapter; pub mod sqlite_adapter;
pub mod counted_tree_hack;
pub mod open; pub mod open;
#[cfg(test)] #[cfg(test)]
@ -62,6 +58,7 @@ pub type Result<T> = std::result::Result<T, Error>;
pub struct TxOpError(pub(crate) Error); pub struct TxOpError(pub(crate) Error);
pub type TxOpResult<T> = std::result::Result<T, TxOpError>; pub type TxOpResult<T> = std::result::Result<T, TxOpError>;
#[derive(Debug)]
pub enum TxError<E> { pub enum TxError<E> {
Abort(E), Abort(E),
Db(Error), Db(Error),
@ -200,10 +197,6 @@ impl Tree {
pub fn len(&self) -> Result<usize> { pub fn len(&self) -> Result<usize> {
self.0.len(self.1) self.0.len(self.1)
} }
/// Forward to the backend's `fast_len` for this tree.
///
/// `self.0` is the backend handle and `self.1` the tree index passed to it.
#[inline]
pub fn fast_len(&self) -> Result<Option<usize>> {
self.0.fast_len(self.1)
}
#[inline] #[inline]
pub fn first(&self) -> Result<Option<(Value, Value)>> { pub fn first(&self) -> Result<Option<(Value, Value)>> {
@ -293,6 +286,11 @@ impl<'a> Transaction<'a> {
pub fn remove<T: AsRef<[u8]>>(&mut self, tree: &Tree, key: T) -> TxOpResult<Option<Value>> { pub fn remove<T: AsRef<[u8]>>(&mut self, tree: &Tree, key: T) -> TxOpResult<Option<Value>> {
self.tx.remove(tree.1, key.as_ref()) self.tx.remove(tree.1, key.as_ref())
} }
/// Clears all values in a tree
///
/// Forwards to the backend transaction's `clear` with this tree's index
/// (`tree.1`), so the clearing is part of the current transaction.
#[inline]
pub fn clear(&mut self, tree: &Tree) -> TxOpResult<()> {
self.tx.clear(tree.1)
}
#[inline] #[inline]
pub fn iter(&self, tree: &Tree) -> TxOpResult<TxValueIter<'_>> { pub fn iter(&self, tree: &Tree) -> TxOpResult<TxValueIter<'_>> {
@ -340,9 +338,6 @@ pub(crate) trait IDb: Send + Sync {
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>; fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
fn len(&self, tree: usize) -> Result<usize>; fn len(&self, tree: usize) -> Result<usize>;
/// Default implementation: always returns `Ok(None)` and ignores `_tree`.
/// Implementors can override this method to return an actual count.
fn fast_len(&self, _tree: usize) -> Result<Option<usize>> {
Ok(None)
}
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>>; fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>>;
fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>; fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
@ -373,6 +368,7 @@ pub(crate) trait ITx {
fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>>; fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>>;
fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>>; fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>>;
fn clear(&mut self, tree: usize) -> TxOpResult<()>;
fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>; fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;
fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>; fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;

View file

@ -4,6 +4,7 @@ use core::ptr::NonNull;
use std::collections::HashMap; use std::collections::HashMap;
use std::convert::TryInto; use std::convert::TryInto;
use std::path::PathBuf; use std::path::PathBuf;
use std::pin::Pin;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use heed::types::ByteSlice; use heed::types::ByteSlice;
@ -131,10 +132,6 @@ impl IDb for LmdbDb {
Ok(tree.len(&tx)?.try_into().unwrap()) Ok(tree.len(&tx)?.try_into().unwrap())
} }
/// Return the result of `self.len(tree)` wrapped in `Some`, so this backend
/// always reports a count. Errors from `len` are propagated.
fn fast_len(&self, tree: usize) -> Result<Option<usize>> {
    self.len(tree).map(Some)
}
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> { fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let mut tx = self.db.write_txn()?; let mut tx = self.db.write_txn()?;
@ -252,8 +249,9 @@ impl<'a> ITx for LmdbTx<'a> {
None => Ok(None), None => Ok(None),
} }
} }
fn len(&self, _tree: usize) -> TxOpResult<usize> { fn len(&self, tree: usize) -> TxOpResult<usize> {
unimplemented!(".len() in transaction not supported with LMDB backend") let tree = self.get_tree(tree)?;
Ok(tree.len(&self.tx)? as usize)
} }
fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> { fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
@ -268,33 +266,48 @@ impl<'a> ITx for LmdbTx<'a> {
tree.delete(&mut self.tx, key)?; tree.delete(&mut self.tx, key)?;
Ok(old_val) Ok(old_val)
} }
fn clear(&mut self, tree: usize) -> TxOpResult<()> {
fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> { let tree = *self.get_tree(tree)?;
unimplemented!("Iterators in transactions not supported with LMDB backend"); tree.clear(&mut self.tx)?;
Ok(())
} }
fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
unimplemented!("Iterators in transactions not supported with LMDB backend"); fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
let tree = *self.get_tree(tree)?;
Ok(Box::new(tree.iter(&self.tx)?.map(tx_iter_item)))
}
fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
let tree = *self.get_tree(tree)?;
Ok(Box::new(tree.rev_iter(&self.tx)?.map(tx_iter_item)))
} }
fn range<'r>( fn range<'r>(
&self, &self,
_tree: usize, tree: usize,
_low: Bound<&'r [u8]>, low: Bound<&'r [u8]>,
_high: Bound<&'r [u8]>, high: Bound<&'r [u8]>,
) -> TxOpResult<TxValueIter<'_>> { ) -> TxOpResult<TxValueIter<'_>> {
unimplemented!("Iterators in transactions not supported with LMDB backend"); let tree = *self.get_tree(tree)?;
Ok(Box::new(
tree.range(&self.tx, &(low, high))?.map(tx_iter_item),
))
} }
fn range_rev<'r>( fn range_rev<'r>(
&self, &self,
_tree: usize, tree: usize,
_low: Bound<&'r [u8]>, low: Bound<&'r [u8]>,
_high: Bound<&'r [u8]>, high: Bound<&'r [u8]>,
) -> TxOpResult<TxValueIter<'_>> { ) -> TxOpResult<TxValueIter<'_>> {
unimplemented!("Iterators in transactions not supported with LMDB backend"); let tree = *self.get_tree(tree)?;
Ok(Box::new(
tree.rev_range(&self.tx, &(low, high))?.map(tx_iter_item),
))
} }
} }
// ---- // ---- iterators outside transactions ----
// complicated, they must hold the transaction object
// therefore a bit of unsafe code (it is a self-referential struct)
type IteratorItem<'a> = heed::Result<( type IteratorItem<'a> = heed::Result<(
<ByteSlice as BytesDecode<'a>>::DItem, <ByteSlice as BytesDecode<'a>>::DItem,
@ -317,12 +330,20 @@ where
where where
F: FnOnce(&'a RoTxn<'a>) -> Result<I>, F: FnOnce(&'a RoTxn<'a>) -> Result<I>,
{ {
let mut res = TxAndIterator { tx, iter: None }; let res = TxAndIterator { tx, iter: None };
let mut boxed = Box::pin(res);
let tx = unsafe { NonNull::from(&res.tx).as_ref() }; // This unsafe allows us to bypass lifetime checks
res.iter = Some(iterfun(tx)?); let tx = unsafe { NonNull::from(&boxed.tx).as_ref() };
let iter = iterfun(tx)?;
Ok(Box::new(res)) let mut_ref = Pin::as_mut(&mut boxed);
// This unsafe allows us to write in a field of the pinned struct
unsafe {
Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
}
Ok(Box::new(TxAndIteratorPin(boxed)))
} }
} }
@ -331,18 +352,26 @@ where
I: Iterator<Item = IteratorItem<'a>> + 'a, I: Iterator<Item = IteratorItem<'a>> + 'a,
{ {
fn drop(&mut self) { fn drop(&mut self) {
// ensure the iterator is dropped before the RoTxn it references
drop(self.iter.take()); drop(self.iter.take());
} }
} }
impl<'a, I> Iterator for TxAndIterator<'a, I> struct TxAndIteratorPin<'a, I>(Pin<Box<TxAndIterator<'a, I>>>)
where
I: Iterator<Item = IteratorItem<'a>> + 'a;
impl<'a, I> Iterator for TxAndIteratorPin<'a, I>
where where
I: Iterator<Item = IteratorItem<'a>> + 'a, I: Iterator<Item = IteratorItem<'a>> + 'a,
{ {
type Item = Result<(Value, Value)>; type Item = Result<(Value, Value)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
match self.iter.as_mut().unwrap().next() { let mut_ref = Pin::as_mut(&mut self.0);
// This unsafe allows us to mutably access the iterator field
let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() };
match next {
None => None, None => None,
Some(Err(e)) => Some(Err(e.into())), Some(Err(e)) => Some(Err(e.into())),
Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))), Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))),
@ -350,7 +379,16 @@ where
} }
} }
// ---- // ---- iterators within transactions ----
fn tx_iter_item<'a>(
item: std::result::Result<(&'a [u8], &'a [u8]), heed::Error>,
) -> TxOpResult<(Vec<u8>, Vec<u8>)> {
item.map(|(k, v)| (k.to_vec(), v.to_vec()))
.map_err(|e| TxOpError(Error::from(e)))
}
// ---- utility ----
#[cfg(target_pointer_width = "64")] #[cfg(target_pointer_width = "64")]
pub fn recommended_map_size() -> usize { pub fn recommended_map_size() -> usize {

View file

@ -11,7 +11,6 @@ use crate::{Db, Error, Result};
pub enum Engine { pub enum Engine {
Lmdb, Lmdb,
Sqlite, Sqlite,
Sled,
} }
impl Engine { impl Engine {
@ -20,7 +19,6 @@ impl Engine {
match self { match self {
Self::Lmdb => "lmdb", Self::Lmdb => "lmdb",
Self::Sqlite => "sqlite", Self::Sqlite => "sqlite",
Self::Sled => "sled",
} }
} }
} }
@ -38,10 +36,10 @@ impl std::str::FromStr for Engine {
match text { match text {
"lmdb" | "heed" => Ok(Self::Lmdb), "lmdb" | "heed" => Ok(Self::Lmdb),
"sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite), "sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite),
"sled" => Ok(Self::Sled), "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.3).".into())),
kind => Err(Error( kind => Err(Error(
format!( format!(
"Invalid DB engine: {} (options are: lmdb, sled, sqlite)", "Invalid DB engine: {} (options are: lmdb, sqlite)",
kind kind
) )
.into(), .into(),
@ -53,8 +51,6 @@ impl std::str::FromStr for Engine {
pub struct OpenOpt { pub struct OpenOpt {
pub fsync: bool, pub fsync: bool,
pub lmdb_map_size: Option<usize>, pub lmdb_map_size: Option<usize>,
pub sled_cache_capacity: usize,
pub sled_flush_every_ms: u64,
} }
impl Default for OpenOpt { impl Default for OpenOpt {
@ -62,31 +58,12 @@ impl Default for OpenOpt {
Self { Self {
fsync: false, fsync: false,
lmdb_map_size: None, lmdb_map_size: None,
sled_cache_capacity: 1024 * 1024 * 1024,
sled_flush_every_ms: 2000,
} }
} }
} }
pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result<Db> { pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result<Db> {
match engine { match engine {
// ---- Sled DB ----
#[cfg(feature = "sled")]
Engine::Sled => {
if opt.fsync {
return Err(Error(
"`metadata_fsync = true` is not supported with the Sled database engine".into(),
));
}
info!("Opening Sled database at: {}", path.display());
let db = crate::sled_adapter::sled::Config::default()
.path(&path)
.cache_capacity(opt.sled_cache_capacity as u64)
.flush_every_ms(Some(opt.sled_flush_every_ms))
.open()?;
Ok(crate::sled_adapter::SledDb::init(db))
}
// ---- Sqlite DB ---- // ---- Sqlite DB ----
#[cfg(feature = "sqlite")] #[cfg(feature = "sqlite")]
Engine::Sqlite => { Engine::Sqlite => {

View file

@ -1,282 +0,0 @@
use core::ops::Bound;
use std::cell::Cell;
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use sled::transaction::{
ConflictableTransactionError, TransactionError, Transactional, TransactionalTree,
UnabortableTransactionError,
};
use crate::{
Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult,
TxResult, TxValueIter, Value, ValueIter,
};
pub use sled;
// -- err
impl From<sled::Error> for Error {
fn from(e: sled::Error) -> Error {
Error(format!("Sled: {}", e).into())
}
}
impl From<sled::Error> for TxOpError {
fn from(e: sled::Error) -> TxOpError {
TxOpError(e.into())
}
}
// -- db
/// Database adapter backed by a `sled::Db`.
pub struct SledDb {
// The underlying Sled database handle.
db: sled::Db,
// Registry of the trees opened so far: the `Vec` holds the tree handles and
// the `HashMap` maps a tree name to its index in that `Vec`. The index is
// the `usize` tree id handed out by `open_tree` and used by other methods.
trees: RwLock<(Vec<sled::Tree>, HashMap<String, usize>)>,
}
impl SledDb {
#[deprecated(
since = "0.9.0",
note = "The Sled database is now deprecated and will be removed in Garage v1.0. Please migrate to LMDB or Sqlite as soon as possible."
)]
pub fn init(db: sled::Db) -> Db {
tracing::warn!("-------------------- IMPORTANT WARNING !!! ----------------------");
tracing::warn!("The Sled database is now deprecated and will be removed in Garage v1.0.");
tracing::warn!("Please migrate to LMDB or Sqlite as soon as possible.");
tracing::warn!("-----------------------------------------------------------------------");
let s = Self {
db,
trees: RwLock::new((Vec::new(), HashMap::new())),
};
Db(Arc::new(s))
}
fn get_tree(&self, i: usize) -> Result<sled::Tree> {
self.trees
.read()
.unwrap()
.0
.get(i)
.cloned()
.ok_or_else(|| Error("invalid tree id".into()))
}
}
impl IDb for SledDb {
fn engine(&self) -> String {
"Sled".into()
}
fn open_tree(&self, name: &str) -> Result<usize> {
let mut trees = self.trees.write().unwrap();
if let Some(i) = trees.1.get(name) {
Ok(*i)
} else {
let tree = self.db.open_tree(name)?;
let i = trees.0.len();
trees.0.push(tree);
trees.1.insert(name.to_string(), i);
Ok(i)
}
}
fn list_trees(&self) -> Result<Vec<String>> {
let mut trees = vec![];
for name in self.db.tree_names() {
let name = std::str::from_utf8(&name)
.map_err(|e| Error(format!("{}", e).into()))?
.to_string();
if name != "__sled__default" {
trees.push(name);
}
}
Ok(trees)
}
fn snapshot(&self, to: &PathBuf) -> Result<()> {
let to_db = sled::open(to)?;
let export = self.db.export();
to_db.import(export);
Ok(())
}
// ----
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
let tree = self.get_tree(tree)?;
let val = tree.get(key)?;
Ok(val.map(|x| x.to_vec()))
}
fn len(&self, tree: usize) -> Result<usize> {
let tree = self.get_tree(tree)?;
Ok(tree.len())
}
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
let tree = self.get_tree(tree)?;
let old_val = tree.insert(key, value)?;
Ok(old_val.map(|x| x.to_vec()))
}
fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
let tree = self.get_tree(tree)?;
let old_val = tree.remove(key)?;
Ok(old_val.map(|x| x.to_vec()))
}
fn clear(&self, tree: usize) -> Result<()> {
let tree = self.get_tree(tree)?;
tree.clear()?;
Ok(())
}
fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?;
Ok(Box::new(tree.iter().map(|v| {
v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
})))
}
fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?;
Ok(Box::new(tree.iter().rev().map(|v| {
v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
})))
}
fn range<'r>(
&self,
tree: usize,
low: Bound<&'r [u8]>,
high: Bound<&'r [u8]>,
) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?;
Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).map(|v| {
v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
})))
}
fn range_rev<'r>(
&self,
tree: usize,
low: Bound<&'r [u8]>,
high: Bound<&'r [u8]>,
) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?;
Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).rev().map(
|v| v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into),
)))
}
// ----
fn transaction(&self, f: &dyn ITxFn) -> TxResult<OnCommit, ()> {
let trees = self.trees.read().unwrap();
let res = trees.0.transaction(|txtrees| {
let mut tx = SledTx {
trees: txtrees,
err: Cell::new(None),
};
match f.try_on(&mut tx) {
TxFnResult::Ok(on_commit) => {
assert!(tx.err.into_inner().is_none());
Ok(on_commit)
}
TxFnResult::Abort => {
assert!(tx.err.into_inner().is_none());
Err(ConflictableTransactionError::Abort(()))
}
TxFnResult::DbErr => {
let e = tx.err.into_inner().expect("No DB error");
Err(e.into())
}
}
});
match res {
Ok(on_commit) => Ok(on_commit),
Err(TransactionError::Abort(())) => Err(TxError::Abort(())),
Err(TransactionError::Storage(s)) => Err(TxError::Db(s.into())),
}
}
}
// ----
/// Transaction handle passed to the user closure by `SledDb::transaction`.
struct SledTx<'a> {
// Transactional views of the trees, indexed by tree id.
trees: &'a [TransactionalTree],
// Last Sled error seen by `save_error`; read back by `SledDb::transaction`
// once the user closure has returned.
err: Cell<Option<UnabortableTransactionError>>,
}
impl<'a> SledTx<'a> {
fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalTree> {
self.trees.get(i).ok_or_else(|| {
TxOpError(Error(
"invalid tree id (it might have been openned after the transaction started)".into(),
))
})
}
fn save_error<R>(
&self,
v: std::result::Result<R, UnabortableTransactionError>,
) -> TxOpResult<R> {
match v {
Ok(x) => Ok(x),
Err(e) => {
let txt = format!("{}", e);
self.err.set(Some(e));
Err(TxOpError(Error(txt.into())))
}
}
}
}
impl<'a> ITx for SledTx<'a> {
    /// Reads `key` from the given tree within the transaction.
    fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
        let handle = self.get_tree(tree)?;
        let found = self.save_error(handle.get(key))?;
        Ok(found.map(|bytes| bytes.to_vec()))
    }

    /// Not available with this backend: always panics.
    fn len(&self, _tree: usize) -> TxOpResult<usize> {
        unimplemented!(".len() in transaction not supported with Sled backend")
    }

    /// Writes `value` under `key` and returns the previous value, if any.
    fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
        let handle = self.get_tree(tree)?;
        let previous = self.save_error(handle.insert(key, value))?;
        Ok(previous.map(|bytes| bytes.to_vec()))
    }

    /// Removes `key` and returns the value it held, if any.
    fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
        let handle = self.get_tree(tree)?;
        let previous = self.save_error(handle.remove(key))?;
        Ok(previous.map(|bytes| bytes.to_vec()))
    }

    // The four iteration methods below are not available with this backend
    // and always panic.

    fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
        unimplemented!("Iterators in transactions not supported with Sled backend");
    }

    fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
        unimplemented!("Iterators in transactions not supported with Sled backend");
    }

    fn range<'r>(
        &self,
        _tree: usize,
        _low: Bound<&'r [u8]>,
        _high: Bound<&'r [u8]>,
    ) -> TxOpResult<TxValueIter<'_>> {
        unimplemented!("Iterators in transactions not supported with Sled backend");
    }

    fn range_rev<'r>(
        &self,
        _tree: usize,
        _low: Bound<&'r [u8]>,
        _high: Bound<&'r [u8]>,
    ) -> TxOpResult<TxValueIter<'_>> {
        unimplemented!("Iterators in transactions not supported with Sled backend");
    }
}

View file

@ -169,10 +169,6 @@ impl IDb for SqliteDb {
} }
} }
fn fast_len(&self, tree: usize) -> Result<Option<usize>> {
Ok(Some(self.len(tree)?))
}
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> { fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let db = self.db.get()?; let db = self.db.get()?;
@ -371,33 +367,64 @@ impl<'a> ITx for SqliteTx<'a> {
Ok(old_val) Ok(old_val)
} }
fn clear(&mut self, tree: usize) -> TxOpResult<()> {
fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> { let tree = self.get_tree(tree)?;
unimplemented!(); self.tx.execute(&format!("DELETE FROM {}", tree), [])?;
Ok(())
} }
fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
unimplemented!(); fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
let tree = self.get_tree(tree)?;
let sql = format!("SELECT k, v FROM {} ORDER BY k ASC", tree);
TxValueIterator::make(self, &sql, [])
}
fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>> {
let tree = self.get_tree(tree)?;
let sql = format!("SELECT k, v FROM {} ORDER BY k DESC", tree);
TxValueIterator::make(self, &sql, [])
} }
fn range<'r>( fn range<'r>(
&self, &self,
_tree: usize, tree: usize,
_low: Bound<&'r [u8]>, low: Bound<&'r [u8]>,
_high: Bound<&'r [u8]>, high: Bound<&'r [u8]>,
) -> TxOpResult<TxValueIter<'_>> { ) -> TxOpResult<TxValueIter<'_>> {
unimplemented!(); let tree = self.get_tree(tree)?;
let (bounds_sql, params) = bounds_sql(low, high);
let sql = format!("SELECT k, v FROM {} {} ORDER BY k ASC", tree, bounds_sql);
let params = params
.iter()
.map(|x| x as &dyn rusqlite::ToSql)
.collect::<Vec<_>>();
TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref())
} }
fn range_rev<'r>( fn range_rev<'r>(
&self, &self,
_tree: usize, tree: usize,
_low: Bound<&'r [u8]>, low: Bound<&'r [u8]>,
_high: Bound<&'r [u8]>, high: Bound<&'r [u8]>,
) -> TxOpResult<TxValueIter<'_>> { ) -> TxOpResult<TxValueIter<'_>> {
unimplemented!(); let tree = self.get_tree(tree)?;
let (bounds_sql, params) = bounds_sql(low, high);
let sql = format!("SELECT k, v FROM {} {} ORDER BY k DESC", tree, bounds_sql);
let params = params
.iter()
.map(|x| x as &dyn rusqlite::ToSql)
.collect::<Vec<_>>();
TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref())
} }
} }
// ---- // ---- iterators outside transactions ----
// complicated, they must hold the Statement and Row objects
// therefore quite some unsafe code (it is a self-referential struct)
struct DbValueIterator<'a> { struct DbValueIterator<'a> {
db: Connection, db: Connection,
@ -417,17 +444,23 @@ impl<'a> DbValueIterator<'a> {
let mut boxed = Box::pin(res); let mut boxed = Box::pin(res);
trace!("make iterator with sql: {}", sql); trace!("make iterator with sql: {}", sql);
// This unsafe allows us to bypass lifetime checks
let db = unsafe { NonNull::from(&boxed.db).as_ref() };
let stmt = db.prepare(sql)?;
let mut_ref = Pin::as_mut(&mut boxed);
// This unsafe allows us to write in a field of the pinned struct
unsafe { unsafe {
let db = NonNull::from(&boxed.db);
let stmt = db.as_ref().prepare(sql)?;
let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed);
Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt); Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt);
}
let mut stmt = NonNull::from(&boxed.stmt); // This unsafe allows us to bypass lifetime checks
let iter = stmt.as_mut().as_mut().unwrap().query(args)?; let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() };
let iter = stmt.as_mut().unwrap().query(args)?;
let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed); let mut_ref = Pin::as_mut(&mut boxed);
// This unsafe allows us to write in a field of the pinned struct
unsafe {
Pin::get_unchecked_mut(mut_ref).iter = Some(iter); Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
} }
@ -449,28 +482,73 @@ impl<'a> Iterator for DbValueIteratorPin<'a> {
type Item = Result<(Value, Value)>; type Item = Result<(Value, Value)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let next = unsafe { let mut_ref = Pin::as_mut(&mut self.0);
let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut self.0); // This unsafe allows us to mutably access the iterator field
Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() };
}; iter_next_row(next)
let row = match next {
Err(e) => return Some(Err(e.into())),
Ok(None) => return None,
Ok(Some(r)) => r,
};
let k = match row.get::<_, Vec<u8>>(0) {
Err(e) => return Some(Err(e.into())),
Ok(x) => x,
};
let v = match row.get::<_, Vec<u8>>(1) {
Err(e) => return Some(Err(e.into())),
Ok(y) => y,
};
Some(Ok((k, v)))
} }
} }
// ---- // ---- iterators within transactions ----
// it's the same except we don't hold a mutex guard,
// only a Statement and a Rows object
/// Self-referential iterator state for queries run inside a transaction:
/// `iter` is produced from `stmt` by `TxValueIterator::make`.
struct TxValueIterator<'a> {
// Prepared statement the rows are read from.
stmt: Statement<'a>,
// Row cursor obtained from `stmt`; `None` until `make` has run the query,
// and reset to `None` by the `Drop` impl.
iter: Option<Rows<'a>>,
// Marks the type as `!Unpin`.
_pin: PhantomPinned,
}
impl<'a> TxValueIterator<'a> {
/// Prepares `sql` on the transaction's connection, runs it with `args`, and
/// returns a boxed iterator over the resulting rows.
///
/// The statement is first moved into a pinned box so that its address is
/// fixed; only then is the query started, and the resulting `Rows` is
/// stored next to the statement it was created from.
///
/// Returns an error if preparing the statement or starting the query fails.
fn make<P: rusqlite::Params>(
tx: &'a SqliteTx<'a>,
sql: &str,
args: P,
) -> TxOpResult<TxValueIter<'a>> {
let stmt = tx.tx.prepare(sql)?;
let res = TxValueIterator {
stmt,
iter: None,
_pin: PhantomPinned,
};
// Pin on the heap before taking any reference into the struct.
let mut boxed = Box::pin(res);
trace!("make iterator with sql: {}", sql);
// This unsafe allows us to bypass lifetime checks: going through `NonNull`
// detaches the reference's lifetime from the local `boxed` binding.
// SAFETY (as relied upon here): `boxed` is pinned and is moved into the
// returned `TxValueIteratorPin`, so the statement stays at this address.
// NOTE(review): a `&mut Statement` is derived from a shared reference
// (`&boxed.stmt`) here — confirm this is acceptable under Rust's aliasing rules.
let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() };
let iter = stmt.query(args)?;
let mut_ref = Pin::as_mut(&mut boxed);
// This unsafe allows us to write in a field of the pinned struct.
// SAFETY (as relied upon here): only the `iter` field is assigned; the
// struct itself is not moved out of the pinned box.
unsafe {
Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
}
Ok(Box::new(TxValueIteratorPin(boxed)))
}
}
impl<'a> Drop for TxValueIterator<'a> {
    /// Releases the row cursor explicitly, before the struct's fields
    /// (including the statement the cursor was created from) are dropped.
    fn drop(&mut self) {
        trace!("drop iter");
        let rows = self.iter.take();
        drop(rows);
    }
}
/// Owning wrapper around the pinned `TxValueIterator`; this is the type that
/// `TxValueIterator::make` boxes and returns as the iterator.
struct TxValueIteratorPin<'a>(Pin<Box<TxValueIterator<'a>>>);
impl<'a> Iterator for TxValueIteratorPin<'a> {
type Item = TxOpResult<(Value, Value)>;
/// Advances the row cursor and converts the next row with `iter_next_row`.
/// Returns `None` if the `iter` field is `None`.
fn next(&mut self) -> Option<Self::Item> {
let mut_ref = Pin::as_mut(&mut self.0);
// This unsafe allows us to mutably access the iterator field.
// SAFETY (as relied upon here): the struct is only accessed in place
// through the mutable reference, never moved out of the pin.
let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() };
iter_next_row(next)
}
}
// ---- utility ----
fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec<Vec<u8>>) { fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec<Vec<u8>>) {
let mut sql = String::new(); let mut sql = String::new();
@ -510,3 +588,25 @@ fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec<V
(sql, params) (sql, params)
} }
/// Turns the result of advancing a rusqlite row cursor into an iterator item.
///
/// * `Ok(None)` (no more rows) becomes `None`.
/// * An error from the cursor, or from reading either column, becomes
///   `Some(Err(..))` after conversion into `E`.
/// * Otherwise column 0 is read as the key and column 1 as the value, both
///   as `Vec<u8>`. Column 1 is only read if column 0 succeeded.
fn iter_next_row<E>(
    next_row: rusqlite::Result<Option<&rusqlite::Row>>,
) -> Option<std::result::Result<(Value, Value), E>>
where
    E: From<rusqlite::Error>,
{
    let row = match next_row {
        Ok(Some(r)) => r,
        Ok(None) => return None,
        Err(e) => return Some(Err(e.into())),
    };
    let pair = row
        .get::<_, Vec<u8>>(0)
        .and_then(|key| row.get::<_, Vec<u8>>(1).map(|value| (key, value)));
    Some(pair.map_err(E::from))
}

View file

@ -10,8 +10,13 @@ fn test_suite(db: Db) {
let vb: &[u8] = &b"plip"[..]; let vb: &[u8] = &b"plip"[..];
let vc: &[u8] = &b"plup"[..]; let vc: &[u8] = &b"plup"[..];
// ---- test simple insert/delete ----
assert!(tree.insert(ka, va).unwrap().is_none()); assert!(tree.insert(ka, va).unwrap().is_none());
assert_eq!(tree.get(ka).unwrap().unwrap(), va); assert_eq!(tree.get(ka).unwrap().unwrap(), va);
assert_eq!(tree.len().unwrap(), 1);
// ---- test transaction logic ----
let res = db.transaction::<_, (), _>(|tx| { let res = db.transaction::<_, (), _>(|tx| {
assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va); assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va);
@ -37,6 +42,8 @@ fn test_suite(db: Db) {
assert!(matches!(res, Err(TxError::Abort(42)))); assert!(matches!(res, Err(TxError::Abort(42))));
assert_eq!(tree.get(ka).unwrap().unwrap(), vb); assert_eq!(tree.get(ka).unwrap().unwrap(), vb);
// ---- test iteration outside of transactions ----
let mut iter = tree.iter().unwrap(); let mut iter = tree.iter().unwrap();
let next = iter.next().unwrap().unwrap(); let next = iter.next().unwrap().unwrap();
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
@ -73,6 +80,48 @@ fn test_suite(db: Db) {
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
assert!(iter.next().is_none()); assert!(iter.next().is_none());
drop(iter); drop(iter);
// ---- test iteration within transactions ----
db.transaction::<_, (), _>(|tx| {
let mut iter = tx.iter(&tree).unwrap();
let next = iter.next().unwrap().unwrap();
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
let next = iter.next().unwrap().unwrap();
assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
assert!(iter.next().is_none());
Ok(())
})
.unwrap();
db.transaction::<_, (), _>(|tx| {
let mut iter = tx.range(&tree, kint..).unwrap();
let next = iter.next().unwrap().unwrap();
assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
assert!(iter.next().is_none());
Ok(())
})
.unwrap();
db.transaction::<_, (), _>(|tx| {
let mut iter = tx.range_rev(&tree, ..kint).unwrap();
let next = iter.next().unwrap().unwrap();
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
assert!(iter.next().is_none());
Ok(())
})
.unwrap();
db.transaction::<_, (), _>(|tx| {
let mut iter = tx.iter_rev(&tree).unwrap();
let next = iter.next().unwrap().unwrap();
assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
let next = iter.next().unwrap().unwrap();
assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
assert!(iter.next().is_none());
Ok(())
})
.unwrap();
} }
#[test] #[test]
@ -90,17 +139,6 @@ fn test_lmdb_db() {
drop(path); drop(path);
} }
#[test]
#[cfg(feature = "sled")]
fn test_sled_db() {
use crate::sled_adapter::SledDb;
let path = mktemp::Temp::new_dir().unwrap();
let db = SledDb::init(sled::open(path.to_path_buf()).unwrap());
test_suite(db);
drop(path);
}
#[test] #[test]
#[cfg(feature = "sqlite")] #[cfg(feature = "sqlite")]
fn test_sqlite_db() { fn test_sqlite_db() {

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage" name = "garage"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -80,12 +80,11 @@ k2v-client.workspace = true
[features] [features]
default = [ "bundled-libs", "metrics", "sled", "lmdb", "sqlite", "k2v" ] default = [ "bundled-libs", "metrics", "lmdb", "sqlite", "k2v" ]
k2v = [ "garage_util/k2v", "garage_api/k2v" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ]
# Database engines, Sled is still our default even though we don't like it # Database engines
sled = [ "garage_model/sled" ]
lmdb = [ "garage_model/lmdb" ] lmdb = [ "garage_model/lmdb" ]
sqlite = [ "garage_model/sqlite" ] sqlite = [ "garage_model/sqlite" ]

View file

@ -70,7 +70,7 @@ impl AdminRpcHandler {
.table .table
.get(&bucket_id, &EmptyKey) .get(&bucket_id, &EmptyKey)
.await? .await?
.map(|x| x.filtered_values(&self.garage.system.ring.borrow())) .map(|x| x.filtered_values(&self.garage.system.cluster_layout()))
.unwrap_or_default(); .unwrap_or_default();
let mpu_counters = self let mpu_counters = self
@ -79,7 +79,7 @@ impl AdminRpcHandler {
.table .table
.get(&bucket_id, &EmptyKey) .get(&bucket_id, &EmptyKey)
.await? .await?
.map(|x| x.filtered_values(&self.garage.system.ring.borrow())) .map(|x| x.filtered_values(&self.garage.system.cluster_layout()))
.unwrap_or_default(); .unwrap_or_default();
let mut relevant_keys = HashMap::new(); let mut relevant_keys = HashMap::new();

View file

@ -18,7 +18,7 @@ use garage_util::error::Error as GarageError;
use garage_table::replication::*; use garage_table::replication::*;
use garage_table::*; use garage_table::*;
use garage_rpc::ring::PARTITION_BITS; use garage_rpc::layout::PARTITION_BITS;
use garage_rpc::*; use garage_rpc::*;
use garage_block::manager::BlockResyncErrorInfo; use garage_block::manager::BlockResyncErrorInfo;
@ -27,7 +27,6 @@ use garage_model::bucket_table::*;
use garage_model::garage::Garage; use garage_model::garage::Garage;
use garage_model::helper::error::{Error, OkOrBadRequest}; use garage_model::helper::error::{Error, OkOrBadRequest};
use garage_model::key_table::*; use garage_model::key_table::*;
use garage_model::migrate::Migrate;
use garage_model::s3::mpu_table::MultipartUpload; use garage_model::s3::mpu_table::MultipartUpload;
use garage_model::s3::version_table::Version; use garage_model::s3::version_table::Version;
@ -42,7 +41,6 @@ pub enum AdminRpc {
BucketOperation(BucketOperation), BucketOperation(BucketOperation),
KeyOperation(KeyOperation), KeyOperation(KeyOperation),
LaunchRepair(RepairOpt), LaunchRepair(RepairOpt),
Migrate(MigrateOpt),
Stats(StatsOpt), Stats(StatsOpt),
Worker(WorkerOperation), Worker(WorkerOperation),
BlockOperation(BlockOperation), BlockOperation(BlockOperation),
@ -96,24 +94,6 @@ impl AdminRpcHandler {
admin admin
} }
// ================ MIGRATION COMMANDS ====================
async fn handle_migrate(self: &Arc<Self>, opt: MigrateOpt) -> Result<AdminRpc, Error> {
if !opt.yes {
return Err(Error::BadRequest(
"Please provide the --yes flag to initiate migration operation.".to_string(),
));
}
let m = Migrate {
garage: self.garage.clone(),
};
match opt.what {
MigrateWhat::Buckets050 => m.migrate_buckets050().await,
}?;
Ok(AdminRpc::Ok("Migration successfull.".into()))
}
// ================ REPAIR COMMANDS ==================== // ================ REPAIR COMMANDS ====================
async fn handle_launch_repair(self: &Arc<Self>, opt: RepairOpt) -> Result<AdminRpc, Error> { async fn handle_launch_repair(self: &Arc<Self>, opt: RepairOpt) -> Result<AdminRpc, Error> {
@ -127,8 +107,8 @@ impl AdminRpcHandler {
opt_to_send.all_nodes = false; opt_to_send.all_nodes = false;
let mut failures = vec![]; let mut failures = vec![];
let ring = self.garage.system.ring.borrow().clone(); let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
for node in ring.layout.node_ids().iter() { for node in all_nodes.iter() {
let node = (*node).into(); let node = (*node).into();
let resp = self let resp = self
.endpoint .endpoint
@ -164,9 +144,9 @@ impl AdminRpcHandler {
async fn handle_stats(&self, opt: StatsOpt) -> Result<AdminRpc, Error> { async fn handle_stats(&self, opt: StatsOpt) -> Result<AdminRpc, Error> {
if opt.all_nodes { if opt.all_nodes {
let mut ret = String::new(); let mut ret = String::new();
let ring = self.garage.system.ring.borrow().clone(); let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
for node in ring.layout.node_ids().iter() { for node in all_nodes.iter() {
let mut opt = opt.clone(); let mut opt = opt.clone();
opt.all_nodes = false; opt.all_nodes = false;
opt.skip_global = true; opt.skip_global = true;
@ -218,11 +198,11 @@ impl AdminRpcHandler {
// Gather table statistics // Gather table statistics
let mut table = vec![" Table\tItems\tMklItems\tMklTodo\tGcTodo".into()]; let mut table = vec![" Table\tItems\tMklItems\tMklTodo\tGcTodo".into()];
table.push(self.gather_table_stats(&self.garage.bucket_table, opt.detailed)?); table.push(self.gather_table_stats(&self.garage.bucket_table)?);
table.push(self.gather_table_stats(&self.garage.key_table, opt.detailed)?); table.push(self.gather_table_stats(&self.garage.key_table)?);
table.push(self.gather_table_stats(&self.garage.object_table, opt.detailed)?); table.push(self.gather_table_stats(&self.garage.object_table)?);
table.push(self.gather_table_stats(&self.garage.version_table, opt.detailed)?); table.push(self.gather_table_stats(&self.garage.version_table)?);
table.push(self.gather_table_stats(&self.garage.block_ref_table, opt.detailed)?); table.push(self.gather_table_stats(&self.garage.block_ref_table)?);
write!( write!(
&mut ret, &mut ret,
"\nTable stats:\n{}", "\nTable stats:\n{}",
@ -232,15 +212,7 @@ impl AdminRpcHandler {
// Gather block manager statistics // Gather block manager statistics
writeln!(&mut ret, "\nBlock manager stats:").unwrap(); writeln!(&mut ret, "\nBlock manager stats:").unwrap();
let rc_len = if opt.detailed { let rc_len = self.garage.block_manager.rc_len()?.to_string();
self.garage.block_manager.rc_len()?.to_string()
} else {
self.garage
.block_manager
.rc_fast_len()?
.map(|x| x.to_string())
.unwrap_or_else(|| "NC".into())
};
writeln!( writeln!(
&mut ret, &mut ret,
@ -261,10 +233,6 @@ impl AdminRpcHandler {
) )
.unwrap(); .unwrap();
if !opt.detailed {
writeln!(&mut ret, "\nIf values are missing above (marked as NC), consider adding the --detailed flag (this will be slow).").unwrap();
}
if !opt.skip_global { if !opt.skip_global {
write!(&mut ret, "\n{}", self.gather_cluster_stats()).unwrap(); write!(&mut ret, "\n{}", self.gather_cluster_stats()).unwrap();
} }
@ -275,11 +243,11 @@ impl AdminRpcHandler {
fn gather_cluster_stats(&self) -> String { fn gather_cluster_stats(&self) -> String {
let mut ret = String::new(); let mut ret = String::new();
// Gather storage node and free space statistics // Gather storage node and free space statistics for current nodes
let layout = &self.garage.system.ring.borrow().layout; let layout = &self.garage.system.cluster_layout();
let mut node_partition_count = HashMap::<Uuid, u64>::new(); let mut node_partition_count = HashMap::<Uuid, u64>::new();
for short_id in layout.ring_assignment_data.iter() { for short_id in layout.current().ring_assignment_data.iter() {
let id = layout.node_id_vec[*short_id as usize]; let id = layout.current().node_id_vec[*short_id as usize];
*node_partition_count.entry(id).or_default() += 1; *node_partition_count.entry(id).or_default() += 1;
} }
let node_info = self let node_info = self
@ -294,8 +262,8 @@ impl AdminRpcHandler {
for (id, parts) in node_partition_count.iter() { for (id, parts) in node_partition_count.iter() {
let info = node_info.get(id); let info = node_info.get(id);
let status = info.map(|x| &x.status); let status = info.map(|x| &x.status);
let role = layout.roles.get(id).and_then(|x| x.0.as_ref()); let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref());
let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?");
let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?");
let capacity = role let capacity = role
.map(|x| x.capacity_string()) .map(|x| x.capacity_string())
@ -366,34 +334,13 @@ impl AdminRpcHandler {
ret ret
} }
fn gather_table_stats<F, R>( fn gather_table_stats<F, R>(&self, t: &Arc<Table<F, R>>) -> Result<String, Error>
&self,
t: &Arc<Table<F, R>>,
detailed: bool,
) -> Result<String, Error>
where where
F: TableSchema + 'static, F: TableSchema + 'static,
R: TableReplication + 'static, R: TableReplication + 'static,
{ {
let (data_len, mkl_len) = if detailed { let data_len = t.data.store.len().map_err(GarageError::from)?.to_string();
( let mkl_len = t.merkle_updater.merkle_tree_len()?.to_string();
t.data.store.len().map_err(GarageError::from)?.to_string(),
t.merkle_updater.merkle_tree_len()?.to_string(),
)
} else {
(
t.data
.store
.fast_len()
.map_err(GarageError::from)?
.map(|x| x.to_string())
.unwrap_or_else(|| "NC".into()),
t.merkle_updater
.merkle_tree_fast_len()?
.map(|x| x.to_string())
.unwrap_or_else(|| "NC".into()),
)
};
Ok(format!( Ok(format!(
" {}\t{}\t{}\t{}\t{}", " {}\t{}\t{}\t{}\t{}",
@ -441,8 +388,8 @@ impl AdminRpcHandler {
) -> Result<AdminRpc, Error> { ) -> Result<AdminRpc, Error> {
if all_nodes { if all_nodes {
let mut ret = vec![]; let mut ret = vec![];
let ring = self.garage.system.ring.borrow().clone(); let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
for node in ring.layout.node_ids().iter() { for node in all_nodes.iter() {
let node = (*node).into(); let node = (*node).into();
match self match self
.endpoint .endpoint
@ -489,8 +436,8 @@ impl AdminRpcHandler {
) -> Result<AdminRpc, Error> { ) -> Result<AdminRpc, Error> {
if all_nodes { if all_nodes {
let mut ret = vec![]; let mut ret = vec![];
let ring = self.garage.system.ring.borrow().clone(); let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
for node in ring.layout.node_ids().iter() { for node in all_nodes.iter() {
let node = (*node).into(); let node = (*node).into();
match self match self
.endpoint .endpoint
@ -525,8 +472,7 @@ impl AdminRpcHandler {
async fn handle_meta_cmd(self: &Arc<Self>, mo: &MetaOperation) -> Result<AdminRpc, Error> { async fn handle_meta_cmd(self: &Arc<Self>, mo: &MetaOperation) -> Result<AdminRpc, Error> {
match mo { match mo {
MetaOperation::Snapshot { all: true } => { MetaOperation::Snapshot { all: true } => {
let ring = self.garage.system.ring.borrow().clone(); let to = self.garage.system.cluster_layout().all_nodes().to_vec();
let to = ring.layout.node_ids().to_vec();
let resps = futures::future::join_all(to.iter().map(|to| async move { let resps = futures::future::join_all(to.iter().map(|to| async move {
let to = (*to).into(); let to = (*to).into();
@ -569,7 +515,6 @@ impl EndpointHandler<AdminRpc> for AdminRpcHandler {
match message { match message {
AdminRpc::BucketOperation(bo) => self.handle_bucket_cmd(bo).await, AdminRpc::BucketOperation(bo) => self.handle_bucket_cmd(bo).await,
AdminRpc::KeyOperation(ko) => self.handle_key_cmd(ko).await, AdminRpc::KeyOperation(ko) => self.handle_key_cmd(ko).await,
AdminRpc::Migrate(opt) => self.handle_migrate(opt.clone()).await,
AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await, AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await,
AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await, AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await,
AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await, AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await,

View file

@ -1,4 +1,4 @@
use std::collections::HashSet; use std::collections::{HashMap, HashSet};
use std::time::Duration; use std::time::Duration;
use format_table::format_table; use format_table::format_table;
@ -33,9 +33,6 @@ pub async fn cli_command_dispatch(
Command::Key(ko) => { Command::Key(ko) => {
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::KeyOperation(ko)).await cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::KeyOperation(ko)).await
} }
Command::Migrate(mo) => {
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Migrate(mo)).await
}
Command::Repair(ro) => { Command::Repair(ro) => {
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await
} }
@ -52,50 +49,61 @@ pub async fn cli_command_dispatch(
} }
pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> { pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> {
let status = match rpc_cli let status = fetch_status(rpc_cli, rpc_host).await?;
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let layout = fetch_layout(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== HEALTHY NODES ===="); println!("==== HEALTHY NODES ====");
let mut healthy_nodes = let mut healthy_nodes =
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()];
for adv in status.iter().filter(|adv| adv.is_up) { for adv in status.iter().filter(|adv| adv.is_up) {
match layout.roles.get(&adv.id) { let host = adv.status.hostname.as_deref().unwrap_or("?");
Some(NodeRoleV(Some(cfg))) => { if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) {
let data_avail = match &adv.status.data_disk_avail { let data_avail = match &adv.status.data_disk_avail {
_ if cfg.capacity.is_none() => "N/A".into(), _ if cfg.capacity.is_none() => "N/A".into(),
Some((avail, total)) => { Some((avail, total)) => {
let pct = (*avail as f64) / (*total as f64) * 100.; let pct = (*avail as f64) / (*total as f64) * 100.;
let avail = bytesize::ByteSize::b(*avail); let avail = bytesize::ByteSize::b(*avail);
format!("{} ({:.1}%)", avail, pct) format!("{} ({:.1}%)", avail, pct)
} }
None => "?".into(), None => "?".into(),
}; };
healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}",
id = adv.id,
host = host,
addr = adv.addr,
tags = cfg.tags.join(","),
zone = cfg.zone,
capacity = cfg.capacity_string(),
data_avail = data_avail,
));
} else {
let prev_role = layout
.versions
.iter()
.rev()
.find_map(|x| match x.roles.get(&adv.id) {
Some(NodeRoleV(Some(cfg))) => Some(cfg),
_ => None,
});
if let Some(cfg) = prev_role {
healthy_nodes.push(format!( healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...",
id = adv.id, id = adv.id,
host = adv.status.hostname, host = host,
addr = adv.addr, addr = adv.addr,
tags = cfg.tags.join(","), tags = cfg.tags.join(","),
zone = cfg.zone, zone = cfg.zone,
capacity = cfg.capacity_string(),
data_avail = data_avail,
)); ));
} } else {
_ => { let new_role = match layout.staging.get().roles.get(&adv.id) {
let new_role = match layout.staging_roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "pending...",
Some(NodeRoleV(Some(_))) => "(pending)",
_ => "NO ROLE ASSIGNED", _ => "NO ROLE ASSIGNED",
}; };
healthy_nodes.push(format!( healthy_nodes.push(format!(
"{id:?}\t{h}\t{addr}\t{new_role}", "{id:?}\t{h}\t{addr}\t\t\t{new_role}",
id = adv.id, id = adv.id,
h = adv.status.hostname, h = host,
addr = adv.addr, addr = adv.addr,
new_role = new_role, new_role = new_role,
)); ));
@ -104,51 +112,76 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
} }
format_table(healthy_nodes); format_table(healthy_nodes);
let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>(); // Determine which nodes are unhealthy and print that to stdout
let failure_case_1 = status let status_map = status
.iter() .iter()
.any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_))))); .map(|adv| (adv.id, adv))
let failure_case_2 = layout .collect::<HashMap<_, _>>();
.roles
.items() let tf = timeago::Formatter::new();
.iter() let mut drain_msg = false;
.any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some()); let mut failed_nodes =
if failure_case_1 || failure_case_2 { vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()];
println!("\n==== FAILED NODES ===="); let mut listed = HashSet::new();
let mut failed_nodes = for ver in layout.versions.iter().rev() {
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; for (node, _, role) in ver.roles.items().iter() {
for adv in status.iter().filter(|adv| !adv.is_up) { let cfg = match role {
if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) { NodeRoleV(Some(role)) if role.capacity.is_some() => role,
let tf = timeago::Formatter::new(); _ => continue,
failed_nodes.push(format!( };
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
id = adv.id, if listed.contains(node) {
host = adv.status.hostname, continue;
addr = adv.addr, }
tags = cfg.tags.join(","), listed.insert(*node);
zone = cfg.zone,
capacity = cfg.capacity_string(), let adv = status_map.get(node);
last_seen = adv if adv.map(|x| x.is_up).unwrap_or(false) {
.last_seen_secs_ago continue;
}
// Node is in a layout version, is not a gateway node, and is not up:
// it is in a failed state, add proper line to the output
let (host, addr, last_seen) = match adv {
Some(adv) => (
adv.status.hostname.as_deref().unwrap_or("?"),
adv.addr.to_string(),
adv.last_seen_secs_ago
.map(|s| tf.convert(Duration::from_secs(s))) .map(|s| tf.convert(Duration::from_secs(s)))
.unwrap_or_else(|| "never seen".into()), .unwrap_or_else(|| "never seen".into()),
)); ),
} None => ("??", "??".into(), "never seen".into()),
} };
for (id, _, role_v) in layout.roles.items().iter() { let capacity = if ver.version == layout.current().version {
if let NodeRoleV(Some(cfg)) = role_v { cfg.capacity_string()
if !status_keys.contains(id) { } else {
failed_nodes.push(format!( drain_msg = true;
"{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen", "draining metadata...".to_string()
id = id, };
tags = cfg.tags.join(","), failed_nodes.push(format!(
zone = cfg.zone, "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
capacity = cfg.capacity_string(), id = node,
)); host = host,
} addr = addr,
} tags = cfg.tags.join(","),
zone = cfg.zone,
capacity = capacity,
last_seen = last_seen,
));
} }
}
if failed_nodes.len() > 1 {
println!("\n==== FAILED NODES ====");
format_table(failed_nodes); format_table(failed_nodes);
if drain_msg {
println!();
println!("Your cluster is expecting to drain data from nodes that are currently unavailable.");
println!("If these nodes are definitely dead, please review the layout history with");
println!(
"`garage layout history` and use `garage layout skip-dead-nodes` to force progress."
);
}
} }
if print_staging_role_changes(&layout) { if print_staging_role_changes(&layout) {
@ -229,3 +262,18 @@ pub async fn cmd_admin(
} }
Ok(()) Ok(())
} }
// ---- utility ----
pub async fn fetch_status(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
) -> Result<Vec<KnownNodeInfo>, Error> {
match rpc_cli
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes),
resp => Err(Error::unexpected_rpc_message(resp)),
}
}

View file

@ -11,7 +11,7 @@ pub struct ConvertDbOpt {
/// https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0) /// https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0)
#[structopt(short = "i")] #[structopt(short = "i")]
input_path: PathBuf, input_path: PathBuf,
/// Input database engine (sled, lmdb or sqlite; limited by db engines /// Input database engine (lmdb or sqlite; limited by db engines
/// enabled in this build) /// enabled in this build)
#[structopt(short = "a")] #[structopt(short = "a")]
input_engine: Engine, input_engine: Engine,

View file

@ -32,6 +32,10 @@ pub async fn cli_layout_command_dispatch(
LayoutOperation::Config(config_opt) => { LayoutOperation::Config(config_opt) => {
cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await
} }
LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await,
LayoutOperation::SkipDeadNodes(assume_sync_opt) => {
cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await
}
} }
} }
@ -49,6 +53,7 @@ pub async fn cmd_assign_role(
}; };
let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
let all_nodes = layout.get_all_nodes();
let added_nodes = args let added_nodes = args
.node_ids .node_ids
@ -58,21 +63,23 @@ pub async fn cmd_assign_role(
status status
.iter() .iter()
.map(|adv| adv.id) .map(|adv| adv.id)
.chain(layout.node_ids().iter().cloned()), .chain(all_nodes.iter().cloned()),
node_id, node_id,
) )
}) })
.collect::<Result<Vec<_>, _>>()?; .collect::<Result<Vec<_>, _>>()?;
let mut roles = layout.roles.clone(); let mut roles = layout.current().roles.clone();
roles.merge(&layout.staging_roles); roles.merge(&layout.staging.get().roles);
for replaced in args.replace.iter() { for replaced in args.replace.iter() {
let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; let replaced_node = find_matching_node(all_nodes.iter().cloned(), replaced)?;
match roles.get(&replaced_node) { match roles.get(&replaced_node) {
Some(NodeRoleV(Some(_))) => { Some(NodeRoleV(Some(_))) => {
layout layout
.staging_roles .staging
.get_mut()
.roles
.merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); .merge(&roles.update_mutator(replaced_node, NodeRoleV(None)));
} }
_ => { _ => {
@ -130,7 +137,9 @@ pub async fn cmd_assign_role(
}; };
layout layout
.staging_roles .staging
.get_mut()
.roles
.merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry))));
} }
@ -149,14 +158,16 @@ pub async fn cmd_remove_role(
) -> Result<(), Error> { ) -> Result<(), Error> {
let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
let mut roles = layout.roles.clone(); let mut roles = layout.current().roles.clone();
roles.merge(&layout.staging_roles); roles.merge(&layout.staging.get().roles);
let deleted_node = let deleted_node =
find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?;
layout layout
.staging_roles .staging
.get_mut()
.roles
.merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); .merge(&roles.update_mutator(deleted_node, NodeRoleV(None)));
send_layout(rpc_cli, rpc_host, layout).await?; send_layout(rpc_cli, rpc_host, layout).await?;
@ -174,13 +185,16 @@ pub async fn cmd_show_layout(
let layout = fetch_layout(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== CURRENT CLUSTER LAYOUT ===="); println!("==== CURRENT CLUSTER LAYOUT ====");
print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); print_cluster_layout(layout.current(), "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
println!(); println!();
println!("Current cluster layout version: {}", layout.version); println!(
"Current cluster layout version: {}",
layout.current().version
);
let has_role_changes = print_staging_role_changes(&layout); let has_role_changes = print_staging_role_changes(&layout);
if has_role_changes { if has_role_changes {
let v = layout.version; let v = layout.current().version;
let res_apply = layout.apply_staged_changes(Some(v + 1)); let res_apply = layout.apply_staged_changes(Some(v + 1));
// this will print the stats of what partitions // this will print the stats of what partitions
@ -189,7 +203,7 @@ pub async fn cmd_show_layout(
Ok((layout, msg)) => { Ok((layout, msg)) => {
println!(); println!();
println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
print_cluster_layout(&layout, "No nodes have a role in the new layout."); print_cluster_layout(layout.current(), "No nodes have a role in the new layout.");
println!(); println!();
for line in msg.iter() { for line in msg.iter() {
@ -199,16 +213,12 @@ pub async fn cmd_show_layout(
println!(); println!();
println!(" garage layout apply --version {}", v + 1); println!(" garage layout apply --version {}", v + 1);
println!(); println!();
println!( println!("You can also revert all proposed changes with: garage layout revert");
"You can also revert all proposed changes with: garage layout revert --version {}",
v + 1)
} }
Err(e) => { Err(e) => {
println!("Error while trying to compute the assignment: {}", e); println!("Error while trying to compute the assignment: {}", e);
println!("This new layout cannot yet be applied."); println!("This new layout cannot yet be applied.");
println!( println!("You can also revert all proposed changes with: garage layout revert");
"You can also revert all proposed changes with: garage layout revert --version {}",
v + 1)
} }
} }
} }
@ -241,9 +251,15 @@ pub async fn cmd_revert_layout(
rpc_host: NodeID, rpc_host: NodeID,
revert_opt: RevertLayoutOpt, revert_opt: RevertLayoutOpt,
) -> Result<(), Error> { ) -> Result<(), Error> {
if !revert_opt.yes {
return Err(Error::Message(
"Please add the --yes flag to run the layout revert operation".into(),
));
}
let layout = fetch_layout(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?;
let layout = layout.revert_staged_changes(revert_opt.version)?; let layout = layout.revert_staged_changes()?;
send_layout(rpc_cli, rpc_host, layout).await?; send_layout(rpc_cli, rpc_host, layout).await?;
@ -266,11 +282,11 @@ pub async fn cmd_config_layout(
.parse::<ZoneRedundancy>() .parse::<ZoneRedundancy>()
.ok_or_message("invalid zone redundancy value")?; .ok_or_message("invalid zone redundancy value")?;
if let ZoneRedundancy::AtLeast(r_int) = r { if let ZoneRedundancy::AtLeast(r_int) = r {
if r_int > layout.replication_factor { if r_int > layout.current().replication_factor {
return Err(Error::Message(format!( return Err(Error::Message(format!(
"The zone redundancy must be smaller or equal to the \ "The zone redundancy must be smaller or equal to the \
replication factor ({}).", replication factor ({}).",
layout.replication_factor layout.current().replication_factor
))); )));
} else if r_int < 1 { } else if r_int < 1 {
return Err(Error::Message( return Err(Error::Message(
@ -280,7 +296,9 @@ pub async fn cmd_config_layout(
} }
layout layout
.staging_parameters .staging
.get_mut()
.parameters
.update(LayoutParameters { zone_redundancy: r }); .update(LayoutParameters { zone_redundancy: r });
println!("The zone redundancy parameter has been set to '{}'.", r); println!("The zone redundancy parameter has been set to '{}'.", r);
did_something = true; did_something = true;
@ -297,25 +315,166 @@ pub async fn cmd_config_layout(
Ok(()) Ok(())
} }
/// Prints the layout history of the cluster as seen by `rpc_host`.
///
/// The layout is obtained with `fetch_layout`. Every entry of
/// `layout.versions` followed by every entry of `layout.old_versions` (each
/// list walked in reverse order) is shown in a table with:
///
/// * its status: `current` when it is the current version, `draining` when
///   its number is at least `layout.min_stored()`, `historical` otherwise;
/// * the number of roles that have a capacity (storage nodes);
/// * the number of roles that have no capacity (gateway nodes).
///
/// When more than one version is present in `layout.versions`, the ack, sync
/// and sync-ack update trackers of every node are printed as well, together
/// with a hint on how to force progress. Otherwise a short note explains that
/// a single layout version is live.
///
/// # Errors
///
/// Returns the error produced by `fetch_layout` if the layout cannot be
/// retrieved.
pub async fn cmd_layout_history(
    rpc_cli: &Endpoint<SystemRpc, ()>,
    rpc_host: NodeID,
) -> Result<(), Error> {
    let layout = fetch_layout(rpc_cli, rpc_host).await?;
    let min_stored = layout.min_stored();

    println!("==== LAYOUT HISTORY ====");
    let mut table = vec!["Version\tStatus\tStorage nodes\tGateway nodes".to_string()];
    for ver in layout
        .versions
        .iter()
        .rev()
        .chain(layout.old_versions.iter().rev())
    {
        let status = if ver.version == layout.current().version {
            "current"
        } else if ver.version >= min_stored {
            "draining"
        } else {
            "historical"
        };
        table.push(format!(
            "#{}\t{}\t{}\t{}",
            ver.version,
            status,
            // Storage nodes: an assigned role that has a capacity.
            ver.roles
                .items()
                .iter()
                .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_some()))
                .count(),
            // Gateway nodes: an assigned role without a capacity.
            ver.roles
                .items()
                .iter()
                .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_none()))
                .count(),
        ));
    }
    format_table(table);
    println!();

    if layout.versions.len() > 1 {
        println!("==== UPDATE TRACKERS ====");
        println!("Several layout versions are currently live in the cluster, and data is being migrated.");
        println!(
            "This is the internal data that Garage stores to know which nodes have what data."
        );
        println!();
        let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()];
        let all_nodes = layout.get_all_nodes();
        for node in all_nodes.iter() {
            table.push(format!(
                "{:?}\t#{}\t#{}\t#{}",
                node,
                layout.update_trackers.ack_map.get(node, min_stored),
                layout.update_trackers.sync_map.get(node, min_stored),
                layout.update_trackers.sync_ack_map.get(node, min_stored),
            ));
        }
        // Sort the data rows only; index 0 is the header line.
        table[1..].sort();
        format_table(table);

        println!();
        println!(
            "If some nodes are not catching up to the latest layout version in the update trackers,"
        );
        println!("it might be because they are offline or unable to complete a sync successfully.");
        println!(
            "You may force progress using `garage layout skip-dead-nodes --version {}`",
            layout.current().version
        );
    } else {
        println!("Your cluster is currently in a stable state with a single live layout version.");
        println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,");
        println!(
            "so you might want to keep old nodes online until their data directories become empty."
        );
    }

    Ok(())
}
pub async fn cmd_layout_skip_dead_nodes(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
opt: SkipDeadNodesOpt,
) -> Result<(), Error> {
let status = fetch_status(rpc_cli, rpc_host).await?;
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
if layout.versions.len() == 1 {
return Err(Error::Message(
"This command cannot be called when there is only one live cluster layout version"
.into(),
));
}
let min_v = layout.min_stored();
if opt.version <= min_v || opt.version > layout.current().version {
return Err(Error::Message(format!(
"Invalid version, you may use the following version numbers: {}",
(min_v + 1..=layout.current().version)
.map(|x| x.to_string())
.collect::<Vec<_>>()
.join(" ")
)));
}
let all_nodes = layout.get_all_nodes();
let mut did_something = false;
for node in all_nodes.iter() {
if status.iter().any(|x| x.id == *node && x.is_up) {
continue;
}
if layout.update_trackers.ack_map.set_max(*node, opt.version) {
println!("Increased the ACK tracker for node {:?}", node);
did_something = true;
}
if opt.allow_missing_data {
if layout.update_trackers.sync_map.set_max(*node, opt.version) {
println!("Increased the SYNC tracker for node {:?}", node);
did_something = true;
}
}
}
if did_something {
send_layout(rpc_cli, rpc_host, layout).await?;
println!("Success.");
Ok(())
} else if !opt.allow_missing_data {
Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into()))
} else {
Err(Error::Message(
"Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(),
))
}
}
// --- utility --- // --- utility ---
pub async fn fetch_layout( pub async fn fetch_layout(
rpc_cli: &Endpoint<SystemRpc, ()>, rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID, rpc_host: NodeID,
) -> Result<ClusterLayout, Error> { ) -> Result<LayoutHistory, Error> {
match rpc_cli match rpc_cli
.call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL)
.await?? .await??
{ {
SystemRpc::AdvertiseClusterLayout(t) => Ok(t), SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), resp => Err(Error::unexpected_rpc_message(resp)),
} }
} }
pub async fn send_layout( pub async fn send_layout(
rpc_cli: &Endpoint<SystemRpc, ()>, rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID, rpc_host: NodeID,
layout: ClusterLayout, layout: LayoutHistory,
) -> Result<(), Error> { ) -> Result<(), Error> {
rpc_cli rpc_cli
.call( .call(
@ -327,7 +486,7 @@ pub async fn send_layout(
Ok(()) Ok(())
} }
pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) {
let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
for (id, _, role) in layout.roles.items().iter() { for (id, _, role) in layout.roles.items().iter() {
let role = match &role.0 { let role = match &role.0 {
@ -366,21 +525,22 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
} }
} }
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool {
let has_role_changes = layout let staging = layout.staging.get();
.staging_roles let has_role_changes = staging
.roles
.items() .items()
.iter() .iter()
.any(|(k, _, v)| layout.roles.get(k) != Some(v)); .any(|(k, _, v)| layout.current().roles.get(k) != Some(v));
let has_layout_changes = *layout.staging_parameters.get() != layout.parameters; let has_layout_changes = *staging.parameters.get() != layout.current().parameters;
if has_role_changes || has_layout_changes { if has_role_changes || has_layout_changes {
println!(); println!();
println!("==== STAGED ROLE CHANGES ===="); println!("==== STAGED ROLE CHANGES ====");
if has_role_changes { if has_role_changes {
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
for (id, _, role) in layout.staging_roles.items().iter() { for (id, _, role) in staging.roles.items().iter() {
if layout.roles.get(id) == Some(role) { if layout.current().roles.get(id) == Some(role) {
continue; continue;
} }
if let Some(role) = &role.0 { if let Some(role) = &role.0 {
@ -402,7 +562,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
if has_layout_changes { if has_layout_changes {
println!( println!(
"Zone redundancy: {}", "Zone redundancy: {}",
layout.staging_parameters.get().zone_redundancy staging.parameters.get().zone_redundancy
); );
} }
true true

View file

@ -31,11 +31,6 @@ pub enum Command {
#[structopt(name = "key", version = garage_version())] #[structopt(name = "key", version = garage_version())]
Key(KeyOperation), Key(KeyOperation),
/// Run migrations from previous Garage version
/// (DO NOT USE WITHOUT READING FULL DOCUMENTATION)
#[structopt(name = "migrate", version = garage_version())]
Migrate(MigrateOpt),
/// Start repair of node data on remote node /// Start repair of node data on remote node
#[structopt(name = "repair", version = garage_version())] #[structopt(name = "repair", version = garage_version())]
Repair(RepairOpt), Repair(RepairOpt),
@ -118,6 +113,14 @@ pub enum LayoutOperation {
/// Revert staged changes to cluster layout /// Revert staged changes to cluster layout
#[structopt(name = "revert", version = garage_version())] #[structopt(name = "revert", version = garage_version())]
Revert(RevertLayoutOpt), Revert(RevertLayoutOpt),
/// View the history of layouts in the cluster
#[structopt(name = "history", version = garage_version())]
History,
/// Skip dead nodes when waiting for a new layout version to be synchronized
#[structopt(name = "skip-dead-nodes", version = garage_version())]
SkipDeadNodes(SkipDeadNodesOpt),
} }
#[derive(StructOpt, Debug)] #[derive(StructOpt, Debug)]
@ -170,9 +173,21 @@ pub struct ApplyLayoutOpt {
#[derive(StructOpt, Debug)] #[derive(StructOpt, Debug)]
pub struct RevertLayoutOpt { pub struct RevertLayoutOpt {
/// Version number of old configuration to which to revert /// The revert operation will not be run unless this flag is added
#[structopt(long = "yes")]
pub(crate) yes: bool,
}
#[derive(StructOpt, Debug)]
pub struct SkipDeadNodesOpt {
/// Version number of the layout to assume is currently up-to-date.
/// This will generally be the current layout version.
#[structopt(long = "version")] #[structopt(long = "version")]
pub(crate) version: Option<u64>, pub(crate) version: u64,
/// Allow the skip even if a quorum of nodes could not be found for
/// the data among the remaining nodes
#[structopt(long = "allow-missing-data")]
pub(crate) allow_missing_data: bool,
} }
#[derive(Serialize, Deserialize, StructOpt, Debug)] #[derive(Serialize, Deserialize, StructOpt, Debug)]
@ -429,23 +444,6 @@ pub struct KeyImportOpt {
pub yes: bool, pub yes: bool,
} }
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct MigrateOpt {
/// Confirm the launch of the migrate operation
#[structopt(long = "yes")]
pub yes: bool,
#[structopt(subcommand)]
pub what: MigrateWhat,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum MigrateWhat {
/// Migrate buckets and permissions from v0.5.0
#[structopt(name = "buckets050", version = garage_version())]
Buckets050,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct RepairOpt { pub struct RepairOpt {
/// Launch repair operation on all nodes /// Launch repair operation on all nodes
@ -537,10 +535,6 @@ pub struct StatsOpt {
#[structopt(short = "a", long = "all-nodes")] #[structopt(short = "a", long = "all-nodes")]
pub all_nodes: bool, pub all_nodes: bool,
/// Gather detailed statistics (this can be long)
#[structopt(short = "d", long = "detailed")]
pub detailed: bool,
/// Don't show global cluster stats (internal use in RPC) /// Don't show global cluster stats (internal use in RPC)
#[structopt(skip)] #[structopt(skip)]
#[serde(default)] #[serde(default)]

View file

@ -450,6 +450,8 @@ pub fn print_block_info(
if refcount != nondeleted_count { if refcount != nondeleted_count {
println!(); println!();
println!("Warning: refcount does not match number of non-deleted versions"); println!(
"Warning: refcount does not match number of non-deleted versions (see issue #644)."
);
} }
} }

View file

@ -18,8 +18,8 @@ compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled
#[cfg(all(feature = "bundled-libs", feature = "system-libs"))] #[cfg(all(feature = "bundled-libs", feature = "system-libs"))]
compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled"); compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled");
#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] #[cfg(not(any(feature = "lmdb", feature = "sqlite")))]
compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb or sqlite.");
use std::net::SocketAddr; use std::net::SocketAddr;
use std::path::PathBuf; use std::path::PathBuf;
@ -72,8 +72,6 @@ async fn main() {
let features = &[ let features = &[
#[cfg(feature = "k2v")] #[cfg(feature = "k2v")]
"k2v", "k2v",
#[cfg(feature = "sled")]
"sled",
#[cfg(feature = "lmdb")] #[cfg(feature = "lmdb")]
"lmdb", "lmdb",
#[cfg(feature = "sqlite")] #[cfg(feature = "sqlite")]

View file

@ -163,7 +163,7 @@ mod tests {
r#" r#"
metadata_dir = "/tmp/garage/meta" metadata_dir = "/tmp/garage/meta"
data_dir = "/tmp/garage/data" data_dir = "/tmp/garage/data"
replication_mode = "3" replication_factor = 3
rpc_bind_addr = "[::]:3901" rpc_bind_addr = "[::]:3901"
rpc_secret_file = "{}" rpc_secret_file = "{}"
@ -185,7 +185,7 @@ mod tests {
r#" r#"
metadata_dir = "/tmp/garage/meta" metadata_dir = "/tmp/garage/meta"
data_dir = "/tmp/garage/data" data_dir = "/tmp/garage/data"
replication_mode = "3" replication_factor = 3
rpc_bind_addr = "[::]:3901" rpc_bind_addr = "[::]:3901"
rpc_secret_file = "{}" rpc_secret_file = "{}"
allow_world_readable_secrets = true allow_world_readable_secrets = true
@ -296,7 +296,7 @@ mod tests {
r#" r#"
metadata_dir = "/tmp/garage/meta" metadata_dir = "/tmp/garage/meta"
data_dir = "/tmp/garage/data" data_dir = "/tmp/garage/data"
replication_mode = "3" replication_factor = 3
rpc_bind_addr = "[::]:3901" rpc_bind_addr = "[::]:3901"
rpc_secret= "dummy" rpc_secret= "dummy"
rpc_secret_file = "dummy" rpc_secret_file = "dummy"

View file

@ -14,42 +14,20 @@ impl CommandExt for process::Command {
} }
fn expect_success_status(&mut self, msg: &str) -> process::ExitStatus { fn expect_success_status(&mut self, msg: &str) -> process::ExitStatus {
let status = self.status().expect(msg); self.expect_success_output(msg).status
status.expect_success(msg);
status
} }
fn expect_success_output(&mut self, msg: &str) -> process::Output { fn expect_success_output(&mut self, msg: &str) -> process::Output {
let output = self.output().expect(msg); let output = self.output().expect(msg);
output.expect_success(msg); if !output.status.success() {
panic!(
"{}: command {:?} exited with error {:?}\nSTDOUT: {}\nSTDERR: {}",
msg,
self,
output.status.code(),
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
);
}
output output
} }
} }
pub trait OutputExt {
fn expect_success(&self, msg: &str);
}
impl OutputExt for process::Output {
fn expect_success(&self, msg: &str) {
self.status.expect_success(msg)
}
}
pub trait ExitStatusExt {
fn expect_success(&self, msg: &str);
}
impl ExitStatusExt for process::ExitStatus {
fn expect_success(&self, msg: &str) {
if !self.success() {
match self.code() {
Some(code) => panic!(
"Command exited with code {code}: {msg}",
code = code,
msg = msg
),
None => panic!("Command exited with signal: {msg}", msg = msg),
}
}
}
}

View file

@ -58,7 +58,7 @@ metadata_dir = "{path}/meta"
data_dir = "{path}/data" data_dir = "{path}/data"
db_engine = "{db_engine}" db_engine = "{db_engine}"
replication_mode = "1" replication_factor = 1
rpc_bind_addr = "127.0.0.1:{rpc_port}" rpc_bind_addr = "127.0.0.1:{rpc_port}"
rpc_public_addr = "127.0.0.1:{rpc_port}" rpc_public_addr = "127.0.0.1:{rpc_port}"
@ -100,7 +100,7 @@ api_bind_addr = "127.0.0.1:{admin_port}"
.arg("server") .arg("server")
.stdout(stdout) .stdout(stdout)
.stderr(stderr) .stderr(stderr)
.env("RUST_LOG", "garage=info,garage_api=trace") .env("RUST_LOG", "garage=debug,garage_api=trace")
.spawn() .spawn()
.expect("Could not start garage"); .expect("Could not start garage");

View file

@ -3,5 +3,6 @@ mod multipart;
mod objects; mod objects;
mod presigned; mod presigned;
mod simple; mod simple;
mod ssec;
mod streaming_signature; mod streaming_signature;
mod website; mod website;

455
src/garage/tests/s3/ssec.rs Normal file
View file

@ -0,0 +1,455 @@
use crate::common::{self, Context};
use aws_sdk_s3::primitives::ByteStream;
use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart};
// Customer-provided encryption key passed as `sse_customer_key` in the tests below
// (presumably a base64-encoded key for the "AES256" algorithm — TODO confirm).
const SSEC_KEY: &str = "u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y=";
// Value sent alongside SSEC_KEY as `sse_customer_key_md5`.
const SSEC_KEY_MD5: &str = "jMGbs3GyZkYjJUP6q5jA7g==";
// Second key, used where the tests need a key different from SSEC_KEY.
const SSEC_KEY2: &str = "XkYVk4Z3vVDO2yJaUqCAEZX6lL10voMxtV06d8my/eU=";
// Value sent alongside SSEC_KEY2 as `sse_customer_key_md5`.
const SSEC_KEY2_MD5: &str = "kedo2ab8J1MCjHwJuLTJHw==";
// 2 * 1024 * 1024; its use is outside this excerpt, presumably a size in bytes (2 MiB).
const SZ_2MB: usize = 2 * 1024 * 1024;
#[tokio::test]
async fn test_ssec_object() {
let ctx = common::context();
let bucket = ctx.create_bucket("sse-c");
let bytes1 = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".to_vec();
let bytes2 = (0..400000)
.map(|x| ((x * 3792) % 256) as u8)
.collect::<Vec<u8>>();
for data in vec![bytes1, bytes2] {
let stream = ByteStream::new(data.clone().into());
// Write encrypted object
let r = ctx
.client
.put_object()
.bucket(&bucket)
.key("testobj")
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY)
.sse_customer_key_md5(SSEC_KEY_MD5)
.body(stream)
.send()
.await
.unwrap();
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into()));
test_read_encrypted(
&ctx,
&bucket,
"testobj",
&data,
SSEC_KEY,
SSEC_KEY_MD5,
SSEC_KEY2,
SSEC_KEY2_MD5,
)
.await;
// Test copy from encrypted to non-encrypted
let r = ctx
.client
.copy_object()
.bucket(&bucket)
.key("test-copy-enc-dec")
.copy_source(format!("{}/{}", bucket, "testobj"))
.copy_source_sse_customer_algorithm("AES256")
.copy_source_sse_customer_key(SSEC_KEY)
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
.send()
.await
.unwrap();
assert_eq!(r.sse_customer_algorithm, None);
assert_eq!(r.sse_customer_key_md5, None);
// Test read decrypted file
let r = ctx
.client
.get_object()
.bucket(&bucket)
.key("test-copy-enc-dec")
.send()
.await
.unwrap();
assert_bytes_eq!(r.body, &data);
assert_eq!(r.sse_customer_algorithm, None);
assert_eq!(r.sse_customer_key_md5, None);
// Test copy from non-encrypted to encrypted
let r = ctx
.client
.copy_object()
.bucket(&bucket)
.key("test-copy-enc-dec-enc")
.copy_source(format!("{}/test-copy-enc-dec", bucket))
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.send()
.await
.unwrap();
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into()));
test_read_encrypted(
&ctx,
&bucket,
"test-copy-enc-dec-enc",
&data,
SSEC_KEY2,
SSEC_KEY2_MD5,
SSEC_KEY,
SSEC_KEY_MD5,
)
.await;
// Test copy from encrypted to encrypted with different keys
let r = ctx
.client
.copy_object()
.bucket(&bucket)
.key("test-copy-enc-enc")
.copy_source(format!("{}/{}", bucket, "testobj"))
.copy_source_sse_customer_algorithm("AES256")
.copy_source_sse_customer_key(SSEC_KEY)
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.send()
.await
.unwrap();
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into()));
test_read_encrypted(
&ctx,
&bucket,
"test-copy-enc-enc",
&data,
SSEC_KEY2,
SSEC_KEY2_MD5,
SSEC_KEY,
SSEC_KEY_MD5,
)
.await;
// Test copy from encrypted to encrypted with the same key
let r = ctx
.client
.copy_object()
.bucket(&bucket)
.key("test-copy-enc-enc-same")
.copy_source(format!("{}/{}", bucket, "testobj"))
.copy_source_sse_customer_algorithm("AES256")
.copy_source_sse_customer_key(SSEC_KEY)
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY)
.sse_customer_key_md5(SSEC_KEY_MD5)
.send()
.await
.unwrap();
assert_eq!(r.sse_customer_algorithm, Some("AES256".into()));
assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into()));
test_read_encrypted(
&ctx,
&bucket,
"test-copy-enc-enc-same",
&data,
SSEC_KEY,
SSEC_KEY_MD5,
SSEC_KEY2,
SSEC_KEY2_MD5,
)
.await;
}
}
/// Exercises SSE-C together with multipart uploads.
///
/// First scenario: a three-part upload to key "a" encrypted with `SSEC_KEY`,
/// read back through `test_read_encrypted`.
/// Second scenario: a four-part upload to key "target" encrypted with
/// `SSEC_KEY2`, where parts 2 and 4 are produced by `upload_part_copy` from
/// byte ranges of two other encrypted objects ("a" and "b").
#[tokio::test]
async fn test_multipart_upload() {
let ctx = common::context();
let bucket = ctx.create_bucket("test-ssec-mpu");
// Three buffers of SZ_2MB bytes, each filled with its own byte value;
// `all` is their concatenation in order u1, u2, u3.
let u1 = vec![0x11; SZ_2MB];
let u2 = vec![0x22; SZ_2MB];
let u3 = vec![0x33; SZ_2MB];
let all = [&u1[..], &u2[..], &u3[..]].concat();
// Test simple encrypted mpu
{
let up = ctx
.client
.create_multipart_upload()
.bucket(&bucket)
.key("a")
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY)
.sse_customer_key_md5(SSEC_KEY_MD5)
.send()
.await
.unwrap();
assert!(up.upload_id.is_some());
assert_eq!(up.sse_customer_algorithm, Some("AES256".into()));
assert_eq!(up.sse_customer_key_md5, Some(SSEC_KEY_MD5.into()));
let uid = up.upload_id.as_ref().unwrap();
// Upload the parts in order, numbering them index + 1, and keep each
// returned etag so it can be paired with the same number on completion.
let mut etags = vec![];
for (i, part) in vec![&u1, &u2, &u3].into_iter().enumerate() {
let pu = ctx
.client
.upload_part()
.bucket(&bucket)
.key("a")
.upload_id(uid)
.part_number((i + 1) as i32)
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY)
.sse_customer_key_md5(SSEC_KEY_MD5)
.body(ByteStream::from(part.to_vec()))
.send()
.await
.unwrap();
etags.push(pu.e_tag.unwrap());
}
let mut cmp = CompletedMultipartUpload::builder();
for (i, etag) in etags.into_iter().enumerate() {
cmp = cmp.parts(
CompletedPart::builder()
.part_number((i + 1) as i32)
.e_tag(etag)
.build(),
);
}
// Note: no SSE-C parameters are passed on the completion request.
ctx.client
.complete_multipart_upload()
.bucket(&bucket)
.key("a")
.upload_id(uid)
.multipart_upload(cmp.build())
.send()
.await
.unwrap();
test_read_encrypted(
&ctx,
&bucket,
"a",
&all,
SSEC_KEY,
SSEC_KEY_MD5,
SSEC_KEY2,
SSEC_KEY2_MD5,
)
.await;
}
// Test upload part copy from first object
{
// (setup) Upload a single part object
ctx.client
.put_object()
.bucket(&bucket)
.key("b")
.body(ByteStream::from(u1.clone()))
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.send()
.await
.unwrap();
let up = ctx
.client
.create_multipart_upload()
.bucket(&bucket)
.key("target")
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.send()
.await
.unwrap();
let uid = up.upload_id.as_ref().unwrap();
// Part 1: direct upload of u3.
let p1 = ctx
.client
.upload_part()
.bucket(&bucket)
.key("target")
.upload_id(uid)
.part_number(1)
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.body(ByteStream::from(u3.clone()))
.send()
.await
.unwrap();
// Part 2: byte range copied from "a"; the source is read with SSEC_KEY
// while the destination part is written with SSEC_KEY2.
let p2 = ctx
.client
.upload_part_copy()
.bucket(&bucket)
.key("target")
.upload_id(uid)
.part_number(2)
.copy_source(format!("{}/a", bucket))
.copy_source_range("bytes=500-550000")
.copy_source_sse_customer_algorithm("AES256")
.copy_source_sse_customer_key(SSEC_KEY)
.copy_source_sse_customer_key_md5(SSEC_KEY_MD5)
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.send()
.await
.unwrap();
// Part 3: direct upload of u2.
let p3 = ctx
.client
.upload_part()
.bucket(&bucket)
.key("target")
.upload_id(uid)
.part_number(3)
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.body(ByteStream::from(u2.clone()))
.send()
.await
.unwrap();
// Part 4: byte range copied from "b"; source and destination both use SSEC_KEY2.
let p4 = ctx
.client
.upload_part_copy()
.bucket(&bucket)
.key("target")
.upload_id(uid)
.part_number(4)
.copy_source(format!("{}/b", bucket))
.copy_source_range("bytes=1500-20500")
.copy_source_sse_customer_algorithm("AES256")
.copy_source_sse_customer_key(SSEC_KEY2)
.copy_source_sse_customer_key_md5(SSEC_KEY2_MD5)
.sse_customer_algorithm("AES256")
.sse_customer_key(SSEC_KEY2)
.sse_customer_key_md5(SSEC_KEY2_MD5)
.send()
.await
.unwrap();
// For copied parts the etag is nested inside `copy_part_result`.
let cmp = CompletedMultipartUpload::builder()
.parts(
CompletedPart::builder()
.part_number(1)
.e_tag(p1.e_tag.unwrap())
.build(),
)
.parts(
CompletedPart::builder()
.part_number(2)
.e_tag(p2.copy_part_result.unwrap().e_tag.unwrap())
.build(),
)
.parts(
CompletedPart::builder()
.part_number(3)
.e_tag(p3.e_tag.unwrap())
.build(),
)
.parts(
CompletedPart::builder()
.part_number(4)
.e_tag(p4.copy_part_result.unwrap().e_tag.unwrap())
.build(),
)
.build();
ctx.client
.complete_multipart_upload()
.bucket(&bucket)
.key("target")
.upload_id(uid)
.multipart_upload(cmp)
.send()
.await
.unwrap();
// (check) Get object
// The copy ranges above are `bytes=first-last`; the matching slices here
// end at `last + 1` because Rust ranges exclude their upper bound.
// Object "b" was uploaded from u1, hence the u1 slice for part 4.
let expected = [&u3[..], &all[500..550001], &u2[..], &u1[1500..20501]].concat();
test_read_encrypted(
&ctx,
&bucket,
"target",
&expected,
SSEC_KEY2,
SSEC_KEY2_MD5,
SSEC_KEY,
SSEC_KEY_MD5,
)
.await;
}
}
/// Asserts that the object `obj_key` of `bucket` can only be read with its
/// own SSE-C key.
///
/// Three reads are attempted:
/// 1. without any SSE-C parameter: must fail;
/// 2. with a complete SSE-C parameter set built from `wrong_enc_key` /
///    `wrong_enc_key_md5`: must fail;
/// 3. with `enc_key` / `enc_key_md5`: must succeed, return exactly
///    `expected_data`, and echo the algorithm and key MD5 in the response.
///
/// Panics (through `assert!` / `unwrap`) when any expectation is not met.
async fn test_read_encrypted(
    ctx: &Context,
    bucket: &str,
    obj_key: &str,
    expected_data: &[u8],
    enc_key: &str,
    enc_key_md5: &str,
    wrong_enc_key: &str,
    wrong_enc_key_md5: &str,
) {
    // Test read encrypted without key
    let o = ctx
        .client
        .get_object()
        .bucket(bucket)
        .key(obj_key)
        .send()
        .await;
    assert!(
        o.is_err(),
        "encrypted file could be read without encryption key"
    );

    // Test read encrypted with wrong key.
    // The algorithm is sent as well, exactly like in the successful read
    // below, so that the only difference with that request is the key itself.
    // Without it the request is not a complete SSE-C request and its failure
    // would not demonstrate that the wrong key was rejected.
    let o = ctx
        .client
        .get_object()
        .bucket(bucket)
        .key(obj_key)
        .sse_customer_algorithm("AES256")
        .sse_customer_key(wrong_enc_key)
        .sse_customer_key_md5(wrong_enc_key_md5)
        .send()
        .await;
    assert!(
        o.is_err(),
        "encrypted file could be read with incorrect encryption key"
    );

    // Test read encrypted with correct key
    let o = ctx
        .client
        .get_object()
        .bucket(bucket)
        .key(obj_key)
        .sse_customer_algorithm("AES256")
        .sse_customer_key(enc_key)
        .sse_customer_key_md5(enc_key_md5)
        .send()
        .await
        .unwrap();
    assert_bytes_eq!(o.body, expected_data);
    assert_eq!(o.sse_customer_algorithm, Some("AES256".into()));
    assert_eq!(o.sse_customer_key_md5, Some(enc_key_md5.to_string()));
}

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_model" name = "garage_model"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -27,6 +27,7 @@ blake2.workspace = true
chrono.workspace = true chrono.workspace = true
err-derive.workspace = true err-derive.workspace = true
hex.workspace = true hex.workspace = true
http.workspace = true
base64.workspace = true base64.workspace = true
parse_duration.workspace = true parse_duration.workspace = true
tracing.workspace = true tracing.workspace = true
@ -42,8 +43,7 @@ tokio.workspace = true
opentelemetry.workspace = true opentelemetry.workspace = true
[features] [features]
default = [ "sled", "lmdb", "sqlite" ] default = [ "lmdb", "sqlite" ]
k2v = [ "garage_util/k2v" ] k2v = [ "garage_util/k2v" ]
lmdb = [ "garage_db/lmdb" ] lmdb = [ "garage_db/lmdb" ]
sled = [ "garage_db/sled" ]
sqlite = [ "garage_db/sqlite" ] sqlite = [ "garage_db/sqlite" ]

View file

@ -10,7 +10,7 @@ use garage_util::config::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::persister::PersisterShared; use garage_util::persister::PersisterShared;
use garage_rpc::replication_mode::ReplicationMode; use garage_rpc::replication_mode::*;
use garage_rpc::system::System; use garage_rpc::system::System;
use garage_block::manager::*; use garage_block::manager::*;
@ -40,8 +40,8 @@ pub struct Garage {
/// The set of background variables that can be viewed/modified at runtime /// The set of background variables that can be viewed/modified at runtime
pub bg_vars: vars::BgVars, pub bg_vars: vars::BgVars,
/// The replication mode of this cluster /// The replication factor of this cluster
pub replication_mode: ReplicationMode, pub replication_factor: ReplicationFactor,
/// The local database /// The local database
pub db: db::Db, pub db: db::Db,
@ -118,9 +118,6 @@ impl Garage {
.ok_or_message("Invalid `db_engine` value in configuration file")?; .ok_or_message("Invalid `db_engine` value in configuration file")?;
let mut db_path = config.metadata_dir.clone(); let mut db_path = config.metadata_dir.clone();
match db_engine { match db_engine {
db::Engine::Sled => {
db_path.push("db");
}
db::Engine::Sqlite => { db::Engine::Sqlite => {
db_path.push("db.sqlite"); db_path.push("db.sqlite");
} }
@ -134,8 +131,6 @@ impl Garage {
v if v == usize::default() => None, v if v == usize::default() => None,
v => Some(v), v => Some(v),
}, },
sled_cache_capacity: config.sled_cache_capacity,
sled_flush_every_ms: config.sled_flush_every_ms,
}; };
let db = db::open_db(&db_path, db_engine, &db_opt) let db = db::open_db(&db_path, db_engine, &db_opt)
.ok_or_message("Unable to open metadata db")?; .ok_or_message("Unable to open metadata db")?;
@ -148,32 +143,30 @@ impl Garage {
.and_then(|x| NetworkKey::from_slice(&x)) .and_then(|x| NetworkKey::from_slice(&x))
.ok_or_message("Invalid RPC secret key")?; .ok_or_message("Invalid RPC secret key")?;
let replication_mode = ReplicationMode::parse(&config.replication_mode) let (replication_factor, consistency_mode) = parse_replication_mode(&config)?;
.ok_or_message("Invalid replication_mode in config file.")?;
info!("Initialize background variable system..."); info!("Initialize background variable system...");
let mut bg_vars = vars::BgVars::new(); let mut bg_vars = vars::BgVars::new();
info!("Initialize membership management system..."); info!("Initialize membership management system...");
let system = System::new(network_key, replication_mode, &config)?; let system = System::new(network_key, replication_factor, consistency_mode, &config)?;
let data_rep_param = TableShardedReplication { let data_rep_param = TableShardedReplication {
system: system.clone(), system: system.clone(),
replication_factor: replication_mode.replication_factor(), replication_factor: replication_factor.into(),
write_quorum: replication_mode.write_quorum(), write_quorum: replication_factor.write_quorum(consistency_mode),
read_quorum: 1, read_quorum: 1,
}; };
let meta_rep_param = TableShardedReplication { let meta_rep_param = TableShardedReplication {
system: system.clone(), system: system.clone(),
replication_factor: replication_mode.replication_factor(), replication_factor: replication_factor.into(),
write_quorum: replication_mode.write_quorum(), write_quorum: replication_factor.write_quorum(consistency_mode),
read_quorum: replication_mode.read_quorum(), read_quorum: replication_factor.read_quorum(consistency_mode),
}; };
let control_rep_param = TableFullReplication { let control_rep_param = TableFullReplication {
system: system.clone(), system: system.clone(),
max_faults: replication_mode.control_write_max_faults(),
}; };
info!("Initialize block manager..."); info!("Initialize block manager...");
@ -258,7 +251,7 @@ impl Garage {
Ok(Arc::new(Self { Ok(Arc::new(Self {
config, config,
bg_vars, bg_vars,
replication_mode, replication_factor,
db, db,
system, system,
block_manager, block_manager,

View file

@ -112,10 +112,12 @@ impl<'a> BucketHelper<'a> {
#[cfg(feature = "k2v")] #[cfg(feature = "k2v")]
{ {
use garage_rpc::ring::Ring; let node_id_vec = self
use std::sync::Arc; .0
.system
let ring: Arc<Ring> = self.0.system.ring.borrow().clone(); .cluster_layout()
.all_nongateway_nodes()
.to_vec();
let k2vindexes = self let k2vindexes = self
.0 .0
.k2v .k2v
@ -124,7 +126,7 @@ impl<'a> BucketHelper<'a> {
.get_range( .get_range(
&bucket_id, &bucket_id,
None, None,
Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), Some((DeletedFilter::NotDeleted, node_id_vec)),
10, 10,
EnumerationOrder::Forward, EnumerationOrder::Forward,
) )

View file

@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
use garage_db as db; use garage_db as db;
use garage_rpc::ring::Ring; use garage_rpc::layout::LayoutHelper;
use garage_rpc::system::System; use garage_rpc::system::System;
use garage_util::background::BackgroundRunner; use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
@ -83,9 +83,9 @@ impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> {
} }
impl<T: CountedItem> CounterEntry<T> { impl<T: CountedItem> CounterEntry<T> {
pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> { pub fn filtered_values(&self, layout: &LayoutHelper) -> HashMap<String, i64> {
let nodes = &ring.layout.node_id_vec[..]; let nodes = layout.all_nongateway_nodes();
self.filtered_values_with_nodes(nodes) self.filtered_values_with_nodes(&nodes)
} }
pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> { pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> {

View file

@ -127,23 +127,21 @@ impl K2VRpcHandler {
.item_table .item_table
.data .data
.replication .replication
.write_nodes(&partition.hash()); .storage_nodes(&partition.hash());
who.sort(); who.sort();
self.system self.system
.rpc .rpc_helper()
.try_call_many( .try_call_many(
&self.endpoint, &self.endpoint,
&who[..], &who,
K2VRpc::InsertItem(InsertedItem { K2VRpc::InsertItem(InsertedItem {
partition, partition,
sort_key, sort_key,
causal_context, causal_context,
value, value,
}), }),
RequestStrategy::with_priority(PRIO_NORMAL) RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1),
.with_quorum(1)
.interrupt_after_quorum(true),
) )
.await?; .await?;
@ -168,7 +166,7 @@ impl K2VRpcHandler {
.item_table .item_table
.data .data
.replication .replication
.write_nodes(&partition.hash()); .storage_nodes(&partition.hash());
who.sort(); who.sort();
call_list.entry(who).or_default().push(InsertedItem { call_list.entry(who).or_default().push(InsertedItem {
@ -187,14 +185,12 @@ impl K2VRpcHandler {
let call_futures = call_list.into_iter().map(|(nodes, items)| async move { let call_futures = call_list.into_iter().map(|(nodes, items)| async move {
let resp = self let resp = self
.system .system
.rpc .rpc_helper()
.try_call_many( .try_call_many(
&self.endpoint, &self.endpoint,
&nodes[..], &nodes[..],
K2VRpc::InsertManyItems(items), K2VRpc::InsertManyItems(items),
RequestStrategy::with_priority(PRIO_NORMAL) RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1),
.with_quorum(1)
.interrupt_after_quorum(true),
) )
.await?; .await?;
Ok::<_, Error>((nodes, resp)) Ok::<_, Error>((nodes, resp))
@ -223,15 +219,16 @@ impl K2VRpcHandler {
}, },
sort_key, sort_key,
}; };
// TODO figure this out with write sets, is it still appropriate???
let nodes = self let nodes = self
.item_table .item_table
.data .data
.replication .replication
.write_nodes(&poll_key.partition.hash()); .read_nodes(&poll_key.partition.hash());
let rpc = self.system.rpc.try_call_many( let rpc = self.system.rpc_helper().try_call_many(
&self.endpoint, &self.endpoint,
&nodes[..], &nodes,
K2VRpc::PollItem { K2VRpc::PollItem {
key: poll_key, key: poll_key,
causal_context, causal_context,
@ -239,9 +236,11 @@ impl K2VRpcHandler {
}, },
RequestStrategy::with_priority(PRIO_NORMAL) RequestStrategy::with_priority(PRIO_NORMAL)
.with_quorum(self.item_table.data.replication.read_quorum()) .with_quorum(self.item_table.data.replication.read_quorum())
.send_all_at_once(true)
.without_timeout(), .without_timeout(),
); );
let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); let timeout_duration =
Duration::from_millis(timeout_msec) + self.system.rpc_helper().rpc_timeout();
let resps = select! { let resps = select! {
r = rpc => r?, r = rpc => r?,
_ = tokio::time::sleep(timeout_duration) => return Ok(None), _ = tokio::time::sleep(timeout_duration) => return Ok(None),
@ -283,11 +282,12 @@ impl K2VRpcHandler {
seen.restrict(&range); seen.restrict(&range);
// Prepare PollRange RPC to send to the storage nodes responsible for the parititon // Prepare PollRange RPC to send to the storage nodes responsible for the parititon
// TODO figure this out with write sets, does it still work????
let nodes = self let nodes = self
.item_table .item_table
.data .data
.replication .replication
.write_nodes(&range.partition.hash()); .read_nodes(&range.partition.hash());
let quorum = self.item_table.data.replication.read_quorum(); let quorum = self.item_table.data.replication.read_quorum();
let msg = K2VRpc::PollRange { let msg = K2VRpc::PollRange {
range, range,
@ -300,7 +300,11 @@ impl K2VRpcHandler {
let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout(); let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout();
let mut requests = nodes let mut requests = nodes
.iter() .iter()
.map(|node| self.system.rpc.call(&self.endpoint, *node, msg.clone(), rs)) .map(|node| {
self.system
.rpc_helper()
.call(&self.endpoint, *node, msg.clone(), rs)
})
.collect::<FuturesUnordered<_>>(); .collect::<FuturesUnordered<_>>();
// Fetch responses. This procedure stops fetching responses when any of the following // Fetch responses. This procedure stops fetching responses when any of the following
@ -316,8 +320,9 @@ impl K2VRpcHandler {
// kind: all items produced by that node until time ts have been returned, so we can // kind: all items produced by that node until time ts have been returned, so we can
// bump the entry in the global vector clock and possibly remove some item-specific // bump the entry in the global vector clock and possibly remove some item-specific
// vector clocks) // vector clocks)
let mut deadline = let mut deadline = Instant::now()
Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + Duration::from_millis(timeout_msec)
+ self.system.rpc_helper().rpc_timeout();
let mut resps = vec![]; let mut resps = vec![];
let mut errors = vec![]; let mut errors = vec![];
loop { loop {
@ -339,7 +344,7 @@ impl K2VRpcHandler {
} }
if errors.len() > nodes.len() - quorum { if errors.len() > nodes.len() - quorum {
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into()); return Err(Error::Quorum(quorum, None, resps.len(), nodes.len(), errors).into());
} }
// Take all returned items into account to produce the response. // Take all returned items into account to produce the response.

View file

@ -7,48 +7,7 @@ use garage_table::{DeletedFilter, EmptyKey, Entry, TableSchema};
use crate::permission::BucketKeyPerm; use crate::permission::BucketKeyPerm;
// Legacy (v0.5) representation of API keys. `Key` is declared as an
// `InitialFormat`, i.e. it has no previous version to migrate from.
pub(crate) mod v05 {
use garage_util::crdt;
use serde::{Deserialize, Serialize};
/// An api key
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Key {
/// The id of the key (immutable), used as partition key
pub key_id: String,
/// The secret_key associated
pub secret_key: String,
/// Name for the key
pub name: crdt::Lww<String>,
/// Is the key deleted
pub deleted: crdt::Bool,
/// Buckets in which the key is authorized. Empty if `Key` is deleted
// CRDT interaction: deleted implies authorized_buckets is empty
pub authorized_buckets: crdt::LwwMap<String, PermissionSet>,
}
/// Permission given to a key in a bucket
#[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct PermissionSet {
/// The key can be used to read the bucket
pub allow_read: bool,
/// The key can be used to write in the bucket
pub allow_write: bool,
}
// Opt `PermissionSet` into the automatic CRDT implementation, with the
// `WARN_IF_DIFFERENT` flag enabled.
impl crdt::AutoCrdt for PermissionSet {
const WARN_IF_DIFFERENT: bool = true;
}
impl garage_util::migrate::InitialFormat for Key {}
}
mod v08 { mod v08 {
use super::v05;
use crate::permission::BucketKeyPerm; use crate::permission::BucketKeyPerm;
use garage_util::crdt; use garage_util::crdt;
use garage_util::data::Uuid; use garage_util::data::Uuid;
@ -86,32 +45,7 @@ mod v08 {
pub local_aliases: crdt::LwwMap<String, Option<Uuid>>, pub local_aliases: crdt::LwwMap<String, Option<Uuid>>,
} }
impl garage_util::migrate::Migrate for Key { impl garage_util::migrate::InitialFormat for Key {}
type Previous = v05::Key;
fn migrate(old_k: v05::Key) -> Key {
let name = crdt::Lww::raw(old_k.name.timestamp(), old_k.name.get().clone());
let state = if old_k.deleted.get() {
crdt::Deletable::Deleted
} else {
// Authorized buckets is ignored here,
// migration is performed in specific migration code in
// garage/migrate.rs
crdt::Deletable::Present(KeyParams {
secret_key: old_k.secret_key,
name,
allow_create_bucket: crdt::Lww::new(false),
authorized_buckets: crdt::Map::new(),
local_aliases: crdt::LwwMap::new(),
})
};
Key {
key_id: old_k.key_id,
state,
}
}
}
} }
pub use v08::*; pub use v08::*;

View file

@ -1,9 +1,6 @@
#[macro_use] #[macro_use]
extern crate tracing; extern crate tracing;
// For migration from previous versions
pub(crate) mod prev;
pub mod permission; pub mod permission;
pub mod index_counter; pub mod index_counter;
@ -18,5 +15,4 @@ pub mod s3;
pub mod garage; pub mod garage;
pub mod helper; pub mod helper;
pub mod migrate;
pub mod snapshot; pub mod snapshot;

View file

@ -1,108 +0,0 @@
use std::sync::Arc;
use garage_util::crdt::*;
use garage_util::data::*;
use garage_util::encode::nonversioned_decode;
use garage_util::error::Error as GarageError;
use garage_util::time::*;
use crate::prev::v051::bucket_table as old_bucket;
use crate::bucket_alias_table::*;
use crate::bucket_table::*;
use crate::garage::Garage;
use crate::helper::error::*;
use crate::permission::*;
/// Migration driver holding a shared handle to the `Garage` instance whose
/// database and tables it reads from and writes to.
pub struct Migrate {
/// Shared handle to the running Garage instance
pub garage: Arc<Garage>,
}
impl Migrate {
pub async fn migrate_buckets050(&self) -> Result<(), Error> {
let tree = self
.garage
.db
.open_tree("bucket:table")
.map_err(GarageError::from)?;
let mut old_buckets = vec![];
for res in tree.iter().map_err(GarageError::from)? {
let (_k, v) = res.map_err(GarageError::from)?;
let bucket =
nonversioned_decode::<old_bucket::Bucket>(&v[..]).map_err(GarageError::from)?;
old_buckets.push(bucket);
}
for bucket in old_buckets {
if let old_bucket::BucketState::Present(p) = bucket.state.get() {
self.migrate_buckets050_do_bucket(&bucket, p).await?;
}
}
Ok(())
}
pub async fn migrate_buckets050_do_bucket(
&self,
old_bucket: &old_bucket::Bucket,
old_bucket_p: &old_bucket::BucketParams,
) -> Result<(), Error> {
let bucket_id = blake2sum(old_bucket.name.as_bytes());
let new_name = if is_valid_bucket_name(&old_bucket.name) {
old_bucket.name.clone()
} else {
// if old bucket name was not valid, replace it by
// a hex-encoded name derived from its identifier
hex::encode(&bucket_id.as_slice()[..16])
};
let website = if *old_bucket_p.website.get() {
Some(WebsiteConfig {
index_document: "index.html".into(),
error_document: None,
})
} else {
None
};
let helper = self.garage.locked_helper().await;
self.garage
.bucket_table
.insert(&Bucket {
id: bucket_id,
state: Deletable::Present(BucketParams {
creation_date: now_msec(),
authorized_keys: Map::new(),
aliases: LwwMap::new(),
local_aliases: LwwMap::new(),
website_config: Lww::new(website),
cors_config: Lww::new(None),
lifecycle_config: Lww::new(None),
quotas: Lww::new(Default::default()),
}),
})
.await?;
helper.set_global_bucket_alias(bucket_id, &new_name).await?;
for (k, ts, perm) in old_bucket_p.authorized_keys.items().iter() {
helper
.set_bucket_key_permissions(
bucket_id,
k,
BucketKeyPerm {
timestamp: *ts,
allow_read: perm.allow_read,
allow_write: perm.allow_write,
allow_owner: false,
},
)
.await?;
}
Ok(())
}
}

View file

@ -1 +0,0 @@
pub(crate) mod v051;

View file

@ -1,63 +0,0 @@
use serde::{Deserialize, Serialize};
use garage_table::crdt::Crdt;
use garage_table::*;
use crate::key_table::v05::PermissionSet;
/// A bucket is a collection of objects
///
/// Its parameters are not directly accessible as:
/// - It must be possible to merge parameters, hence the use of a LWW CRDT.
/// - A bucket has 2 states, Present or Deleted, and parameters make sense only if present.
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Bucket {
/// Name of the bucket
pub name: String,
/// State, and configuration if not deleted, of the bucket
pub state: crdt::Lww<BucketState>,
}
/// State of a bucket: either deleted, or present together with its parameters
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub enum BucketState {
/// The bucket is deleted
Deleted,
/// The bucket exists; its configuration is carried by the variant
Present(BucketParams),
}
impl Crdt for BucketState {
    /// Merges `o` into `self`.
    ///
    /// A `Deleted` state on the other side always wins. When both sides are
    /// `Present` their parameters are merged; when only the other side is
    /// `Present`, `self` stays `Deleted`.
    fn merge(&mut self, o: &Self) {
        let theirs = match o {
            BucketState::Present(params) => params,
            BucketState::Deleted => {
                *self = BucketState::Deleted;
                return;
            }
        };

        if let BucketState::Present(mine) = self {
            mine.merge(theirs);
        }
    }
}
/// Configuration for a bucket (only meaningful while the bucket is present)
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct BucketParams {
/// Map of keys with access to the bucket, and what kind of access they give
pub authorized_keys: crdt::LwwMap<String, PermissionSet>,
/// Whether the bucket is served as a website over HTTP
pub website: crdt::Lww<bool>,
}
impl Crdt for BucketParams {
    /// Merges `o` into `self` field by field: first the authorized keys,
    /// then the website flag.
    fn merge(&mut self, o: &Self) {
        let Self {
            authorized_keys,
            website,
        } = self;
        authorized_keys.merge(&o.authorized_keys);
        website.merge(&o.website);
    }
}
impl Crdt for Bucket {
/// Merges `other` into `self` by merging the `state` fields;
/// the bucket name is left untouched.
fn merge(&mut self, other: &Self) {
self.state.merge(&other.state);
}
}

View file

@ -1 +0,0 @@
pub(crate) mod bucket_table;

View file

@ -121,13 +121,7 @@ impl Worker for LifecycleWorker {
mpu_aborted, mpu_aborted,
.. ..
} => { } => {
let n_objects = self let n_objects = self.garage.object_table.data.store.len().ok();
.garage
.object_table
.data
.store
.fast_len()
.unwrap_or(None);
let progress = match n_objects { let progress = match n_objects {
None => "...".to_string(), None => "...".to_string(),
Some(total) => format!( Some(total) => format!(

View file

@ -17,7 +17,7 @@ pub const OBJECTS: &str = "objects";
pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads"; pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads";
pub const BYTES: &str = "bytes"; pub const BYTES: &str = "bytes";
mod v05 { mod v08 {
use garage_util::data::{Hash, Uuid}; use garage_util::data::{Hash, Uuid};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::BTreeMap; use std::collections::BTreeMap;
@ -26,7 +26,7 @@ mod v05 {
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Object { pub struct Object {
/// The bucket in which the object is stored, used as partition key /// The bucket in which the object is stored, used as partition key
pub bucket: String, pub bucket_id: Uuid,
/// The key at which the object is stored in its bucket, used as sorting key /// The key at which the object is stored in its bucket, used as sorting key
pub key: String, pub key: String,
@ -92,45 +92,6 @@ mod v05 {
impl garage_util::migrate::InitialFormat for Object {} impl garage_util::migrate::InitialFormat for Object {}
} }
mod v08 {
    use garage_util::data::Uuid;
    use serde::{Deserialize, Serialize};

    use super::v05;

    pub use v05::{
        ObjectVersion, ObjectVersionData, ObjectVersionHeaders, ObjectVersionMeta,
        ObjectVersionState,
    };

    /// An object
    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
    pub struct Object {
        /// The bucket in which the object is stored, used as partition key
        pub bucket_id: Uuid,

        /// The key at which the object is stored in its bucket, used as sorting key
        pub key: String,

        /// The list of currently stored versions of the object
        pub(super) versions: Vec<ObjectVersion>,
    }

    impl garage_util::migrate::Migrate for Object {
        type Previous = v05::Object;

        /// Converts a v05 object: the bucket id becomes the blake2 sum of the
        /// old bucket name, while key and versions are carried over as-is.
        fn migrate(old: v05::Object) -> Object {
            use garage_util::data::blake2sum;

            let bucket_id = blake2sum(old.bucket.as_bytes());
            Object {
                bucket_id,
                key: old.key,
                versions: old.versions,
            }
        }
    }
}
mod v09 { mod v09 {
use garage_util::data::Uuid; use garage_util::data::Uuid;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -210,7 +171,179 @@ mod v09 {
} }
} }
pub use v09::*; mod v010 {
use garage_util::data::{Hash, Uuid};
use serde::{Deserialize, Serialize};
use super::v09;
/// An object
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Object {
/// The bucket in which the object is stored, used as partition key
pub bucket_id: Uuid,
/// The key at which the object is stored in its bucket, used as sorting key
pub key: String,
/// The list of currently stored versions of the object
pub(super) versions: Vec<ObjectVersion>,
}
/// Information about a version of an object
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersion {
/// Id of the version
pub uuid: Uuid,
/// Timestamp of when the object was created
pub timestamp: u64,
/// State of the version
pub state: ObjectVersionState,
}
/// State of an object version
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub enum ObjectVersionState {
/// The version is being received
Uploading {
/// Indicates whether this is a multipart upload
multipart: bool,
/// Encryption params + headers to be included in the final object
encryption: ObjectVersionEncryption,
},
/// The version is fully received
Complete(ObjectVersionData),
/// The uploaded version contained errors or the upload was explicitly aborted
Aborted,
}
/// Data stored in object version
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub enum ObjectVersionData {
/// The object was deleted, this Version is a tombstone to mark it as such
DeleteMarker,
/// The object is short, it's stored inlined.
/// It is never compressed. For encrypted objects, it is encrypted using
/// AES256-GCM, like the encrypted headers.
Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
/// The object is not short, Hash of first block is stored here, next segments hashes are
/// stored in the version table
FirstBlock(ObjectVersionMeta, Hash),
}
/// Metadata about the object version
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersionMeta {
/// Size of the object. If object is encrypted/compressed,
/// this is always the size of the unencrypted/uncompressed data
pub size: u64,
/// etag of the object
pub etag: String,
/// Encryption params + headers (encrypted or plaintext)
pub encryption: ObjectVersionEncryption,
}
/// Encryption information + metadata
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub enum ObjectVersionEncryption {
/// The object is encrypted; its headers are only stored in encrypted form.
/// NOTE(review): the variant name suggests S3 SSE-C (customer-provided
/// keys) -- confirm against the encryption module.
SseC {
/// Encrypted serialized ObjectVersionHeaders struct.
/// This is never compressed, just encrypted using AES256-GCM.
#[serde(with = "serde_bytes")]
headers: Vec<u8>,
/// Whether data blocks are compressed in addition to being encrypted
/// (compression happens before encryption, whereas for non-encrypted
/// objects, compression is handled at the level of the block manager)
compressed: bool,
},
/// The object is not encrypted; its headers are stored as-is.
Plaintext {
/// Plain-text headers
headers: ObjectVersionHeaders,
},
}
/// Vector of headers, as tuples of the format (header name, header value).
/// The order of the vector is part of the value (it takes part in the derived
/// comparisons and in serialization).
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersionHeaders(pub Vec<(String, String)>);
impl garage_util::migrate::Migrate for Object {
    const VERSION_MARKER: &'static [u8] = b"G010s3ob";

    type Previous = v09::Object;

    /// Converts an object in the v0.9 format: bucket id and key are carried
    /// over unchanged, every version goes through `migrate_version`.
    fn migrate(old: v09::Object) -> Object {
        let versions = old.versions.into_iter().map(migrate_version).collect();
        Object {
            bucket_id: old.bucket_id,
            key: old.key,
            versions,
        }
    }
}
/// Converts one v0.9 object version: uuid and timestamp are copied, the state
/// is translated variant by variant (the headers of an in-progress upload
/// become a plaintext `ObjectVersionEncryption`).
fn migrate_version(old: v09::ObjectVersion) -> ObjectVersion {
    let state = match old.state {
        v09::ObjectVersionState::Aborted => ObjectVersionState::Aborted,
        v09::ObjectVersionState::Complete(data) => {
            ObjectVersionState::Complete(migrate_data(data))
        }
        v09::ObjectVersionState::Uploading { multipart, headers } => {
            let encryption = migrate_headers(headers);
            ObjectVersionState::Uploading {
                multipart,
                encryption,
            }
        }
    };

    ObjectVersion {
        uuid: old.uuid,
        timestamp: old.timestamp,
        state,
    }
}
/// Converts the data part of a v0.9 object version. Only the embedded
/// metadata changes format; inline bytes and first-block hash are kept as-is.
fn migrate_data(old: v09::ObjectVersionData) -> ObjectVersionData {
    match old {
        v09::ObjectVersionData::Inline(old_meta, bytes) => {
            let meta = migrate_meta(old_meta);
            ObjectVersionData::Inline(meta, bytes)
        }
        v09::ObjectVersionData::FirstBlock(old_meta, first_block) => {
            let meta = migrate_meta(old_meta);
            ObjectVersionData::FirstBlock(meta, first_block)
        }
        v09::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker,
    }
}
/// Converts v0.9 object metadata: size and etag are kept, the headers are
/// wrapped through `migrate_headers`.
fn migrate_meta(old: v09::ObjectVersionMeta) -> ObjectVersionMeta {
    let v09::ObjectVersionMeta {
        size,
        etag,
        headers,
        ..
    } = old;
    let encryption = migrate_headers(headers);
    ObjectVersionMeta {
        size,
        etag,
        encryption,
    }
}
/// Converts v0.9 headers into a plaintext `ObjectVersionEncryption`.
///
/// The dedicated `content_type` field becomes a regular `content-type` header
/// placed first, unless its value is `"blob"`, in which case it is dropped.
/// All other headers follow in their iteration order.
fn migrate_headers(old: v09::ObjectVersionHeaders) -> ObjectVersionEncryption {
    use http::header::CONTENT_TYPE;

    let mut plain = Vec::with_capacity(old.other.len() + 1);
    if old.content_type != "blob" {
        plain.push((CONTENT_TYPE.as_str().to_owned(), old.content_type));
    }
    plain.extend(old.other);

    ObjectVersionEncryption::Plaintext {
        headers: ObjectVersionHeaders(plain),
    }
}
// Since ObjectVersionHeaders can now be serialized independently, for the
// purpose of being encrypted, we need it to support migrations on its own
// as well. It is declared as an initial format (no previous version), with
// its own version marker.
impl garage_util::migrate::InitialFormat for ObjectVersionHeaders {
const VERSION_MARKER: &'static [u8] = b"G010s3oh";
}
}
pub use v010::*;
impl Object { impl Object {
/// Initialize an Object struct from parts /// Initialize an Object struct from parts

View file

@ -11,64 +11,11 @@ use garage_table::*;
use crate::s3::block_ref_table::*; use crate::s3::block_ref_table::*;
mod v05 { mod v08 {
use garage_util::crdt; use garage_util::crdt;
use garage_util::data::{Hash, Uuid}; use garage_util::data::{Hash, Uuid};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
/// A version of an object
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Version {
/// UUID of the version, used as partition key
pub uuid: Uuid,
// Actual data: the blocks for this version
// In the case of a multipart upload, also store the etags
// of individual parts and check them when doing CompleteMultipartUpload
/// Is this version deleted
pub deleted: crdt::Bool,
/// list of blocks of data composing the version
pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
/// Etag of each part in case of a multipart upload, empty otherwise
pub parts_etags: crdt::Map<u64, String>,
// Back link to bucket+key so that we can figure if
// this was deleted later on
/// Bucket in which the related object is stored
pub bucket: String,
/// Key in which the related object is stored
pub key: String,
}
#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct VersionBlockKey {
/// Number of the part
pub part_number: u64,
/// Offset of this sub-segment in its part
pub offset: u64,
}
/// Informations about a single block
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct VersionBlock {
/// Blake2 sum of the block
pub hash: Hash,
/// Size of the block
pub size: u64,
}
impl garage_util::migrate::InitialFormat for Version {}
}
mod v08 {
use garage_util::crdt;
use garage_util::data::Uuid;
use serde::{Deserialize, Serialize};
use super::v05;
pub use v05::{VersionBlock, VersionBlockKey};
/// A version of an object /// A version of an object
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Version { pub struct Version {
@ -93,22 +40,25 @@ mod v08 {
pub key: String, pub key: String,
} }
impl garage_util::migrate::Migrate for Version { #[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
type Previous = v05::Version; pub struct VersionBlockKey {
/// Number of the part
fn migrate(old: v05::Version) -> Version { pub part_number: u64,
use garage_util::data::blake2sum; /// Offset of this sub-segment in its part as sent by the client
/// (before any kind of compression or encryption)
Version { pub offset: u64,
uuid: old.uuid,
deleted: old.deleted,
blocks: old.blocks,
parts_etags: old.parts_etags,
bucket_id: blake2sum(old.bucket.as_bytes()),
key: old.key,
}
}
} }
/// Informations about a single block
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct VersionBlock {
/// Blake2 sum of the block
pub hash: Hash,
/// Size of the block, before any kind of compression or encryption
pub size: u64,
}
impl garage_util::migrate::InitialFormat for Version {}
} }
pub(crate) mod v09 { pub(crate) mod v09 {

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_net" name = "garage_net"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_rpc" name = "garage_rpc"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"

View file

@ -114,16 +114,6 @@ impl Graph<FlowEdge> {
Ok(result) Ok(result)
} }
/// This function returns the value of the flow incoming to v.
pub fn get_inflow(&self, v: Vertex) -> Result<i64, String> {
let idv = self.get_vertex_id(&v)?;
let mut result = 0;
for edge in self.graph[idv].iter() {
result += max(0, self.graph[edge.dest][edge.rev].flow);
}
Ok(result)
}
/// This function returns the value of the flow outgoing from v. /// This function returns the value of the flow outgoing from v.
pub fn get_outflow(&self, v: Vertex) -> Result<i64, String> { pub fn get_outflow(&self, v: Vertex) -> Result<i64, String> {
let idv = self.get_vertex_id(&v)?; let idv = self.get_vertex_id(&v)?;

298
src/rpc/layout/helper.rs Normal file
View file

@ -0,0 +1,298 @@
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::atomic::{AtomicUsize, Ordering};
use serde::{Deserialize, Serialize};
use garage_util::data::*;
use super::*;
use crate::replication_mode::*;
/// Compact summary of a node's layout state, as built by `LayoutHelper::digest`.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct RpcLayoutDigest {
/// Cluster layout version
pub current_version: u64,
/// Number of active layout versions
pub active_versions: usize,
/// Hash of cluster layout update trackers
pub trackers_hash: Hash,
/// Hash of cluster layout staging data
pub staging_hash: Hash,
}
/// Small copyable summary of the layout, as built by `LayoutHelper::sync_digest`.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct SyncLayoutDigest {
// Version number of the current (latest) layout version
current: u64,
// Cached `ack_map_min` value of the helper
ack_map_min: u64,
// Version number of the oldest layout version still stored
min_stored: u64,
}
/// Wrapper around a `LayoutHistory` that keeps values derived from it
/// (computed in `LayoutHelper::new`) together with the per-version ack locks.
pub struct LayoutHelper {
replication_factor: ReplicationFactor,
consistency_mode: ConsistencyMode,
// `Some` at all times except transiently inside `update`, which moves the
// history out in order to rebuild the helper.
layout: Option<LayoutHistory>,
// cached values
ack_map_min: u64,
sync_map_min: u64,
all_nodes: Vec<Uuid>,
all_nongateway_nodes: Vec<Uuid>,
trackers_hash: Hash,
staging_hash: Hash,
// ack lock: counts in-progress write operations for each
// layout version ; we don't increase the ack update tracker
// while this lock is nonzero
pub(crate) ack_lock: HashMap<u64, AtomicUsize>,
}
// Read-only access to the wrapped `LayoutHistory`. There is no `DerefMut`:
// mutations go through `LayoutHelper::update` so cached values are recomputed.
impl Deref for LayoutHelper {
type Target = LayoutHistory;
fn deref(&self) -> &LayoutHistory {
self.layout()
}
}
impl LayoutHelper {
/// Builds a helper around `layout`, cleaning the history up and computing
/// all cached values.
///
/// `ack_lock` carries over the in-progress write counters of a previous
/// helper: entries whose counter is zero are dropped, and an entry for the
/// current layout version is created if missing.
pub fn new(
replication_factor: ReplicationFactor,
consistency_mode: ConsistencyMode,
mut layout: LayoutHistory,
mut ack_lock: HashMap<u64, AtomicUsize>,
) -> Self {
// In the new() function of the helper, we do a bunch of cleanup
// and calculations on the layout history to make sure things are
// correct and we have rapid access to important values such as
// the layout versions to use when reading to ensure consistency.
if consistency_mode != ConsistencyMode::Consistent {
// Fast path for when no consistency is required.
// In this case we only need to keep the last version of the layout,
// we don't care about coordinating stuff in the cluster.
layout.keep_current_version_only();
}
layout.cleanup_old_versions();
let all_nodes = layout.get_all_nodes();
let all_nongateway_nodes = layout.get_all_nongateway_nodes();
layout.clamp_update_trackers(&all_nodes);
let min_version = layout.min_stored();
// ack_map_min is the minimum value of ack_map among all nodes
// in the cluster (gateway, non-gateway, current and previous layouts).
// It is the highest layout version which all of these nodes have
// acknowledged, indicating that they are aware of it and are no
// longer processing write operations that did not take it into account.
let ack_map_min = layout
.update_trackers
.ack_map
.min_among(&all_nodes, min_version);
// sync_map_min is the minimum value of sync_map among storage nodes
// in the cluster (non-gateway nodes only, current and previous layouts).
// It is the highest layout version for which we know that all relevant
// storage nodes have fulfilled a sync, and therefore it is safe to
// use a read quorum within that layout to ensure consistency.
// Gateway nodes are excluded here because they hold no relevant data
// (they store the bucket and access key tables, but we don't have
// consistency on those).
// This value is calculated using quorums to allow progress even
// if not all nodes have successfully completed a sync.
let sync_map_min =
layout.calculate_sync_map_min_with_quorum(replication_factor, &all_nongateway_nodes);
let trackers_hash = layout.calculate_trackers_hash();
let staging_hash = layout.calculate_staging_hash();
// Forget counters of versions with no write in progress, but always
// keep a counter for the current version.
ack_lock.retain(|_, cnt| *cnt.get_mut() > 0);
ack_lock
.entry(layout.current().version)
.or_insert(AtomicUsize::new(0));
LayoutHelper {
replication_factor,
consistency_mode,
layout: Some(layout),
ack_map_min,
sync_map_min,
all_nodes,
all_nongateway_nodes,
trackers_hash,
staging_hash,
ack_lock,
}
}
// ------------------ single updating function --------------
/// Returns the wrapped layout history.
/// Panics if the `layout` field is `None`, which only happens transiently
/// inside `update` while the helper is being rebuilt.
fn layout(&self) -> &LayoutHistory {
self.layout.as_ref().unwrap()
}
/// Applies `f` to the wrapped layout history. If `f` reports a change
/// (returns `true`), the whole helper is rebuilt through `Self::new` so that
/// the history is cleaned up and all cached values are recomputed.
/// Returns the value returned by `f`.
pub(crate) fn update<F>(&mut self, f: F) -> bool
where
F: FnOnce(&mut LayoutHistory) -> bool,
{
let changed = f(self.layout.as_mut().unwrap());
if changed {
// The history is moved out (leaving `None`) and the ack locks are
// taken (leaving an empty map); both are put back by the
// assignment to `*self`.
*self = Self::new(
self.replication_factor,
self.consistency_mode,
self.layout.take().unwrap(),
std::mem::take(&mut self.ack_lock),
);
}
changed
}
// ------------------ read helpers ---------------
/// Cached result of `LayoutHistory::get_all_nodes`, computed in `new`.
pub fn all_nodes(&self) -> &[Uuid] {
&self.all_nodes
}
/// Cached result of `LayoutHistory::get_all_nongateway_nodes`, computed in `new`.
pub fn all_nongateway_nodes(&self) -> &[Uuid] {
&self.all_nongateway_nodes
}
/// Cached minimum of the ack tracker among all nodes, computed in `new`.
pub fn ack_map_min(&self) -> u64 {
self.ack_map_min
}
/// Cached result of `calculate_sync_map_min_with_quorum`, computed in `new`.
pub fn sync_map_min(&self) -> u64 {
self.sync_map_min
}
/// Summarizes the layout as (current version, cached ack minimum, oldest
/// stored version).
pub fn sync_digest(&self) -> SyncLayoutDigest {
    let history = self.layout();
    SyncLayoutDigest {
        current: history.current().version,
        ack_map_min: self.ack_map_min,
        min_stored: history.min_stored(),
    }
}
/// Returns the nodes to read `position` from: those of the layout version
/// whose number equals the cached `sync_map_min`, or of the latest version
/// when no stored version has that number.
pub fn read_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
    let versions = &self.layout().versions;
    let target = self.sync_map_min;
    let chosen = versions
        .iter()
        .find(|v| v.version == target)
        .or_else(|| versions.last())
        .unwrap();
    chosen
        .nodes_of(position, chosen.replication_factor)
        .collect()
}
/// Returns, for each stored layout version (oldest first), the set of nodes
/// storing `position` in that version.
pub fn storage_sets_of(&self, position: &Hash) -> Vec<Vec<Uuid>> {
    let versions = &self.layout().versions;
    let mut sets: Vec<Vec<Uuid>> = Vec::with_capacity(versions.len());
    for v in versions.iter() {
        sets.push(v.nodes_of(position, v.replication_factor).collect());
    }
    sets
}
/// Returns the sorted, deduplicated union of the nodes storing `position`
/// across all stored layout versions.
pub fn storage_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
    let mut nodes: Vec<Uuid> = self
        .layout()
        .versions
        .iter()
        .flat_map(|v| v.nodes_of(position, v.replication_factor))
        .collect();
    nodes.sort();
    nodes.dedup();
    nodes
}
/// Cached result of `LayoutHistory::calculate_trackers_hash`, computed in `new`.
pub fn trackers_hash(&self) -> Hash {
self.trackers_hash
}
/// Cached result of `LayoutHistory::calculate_staging_hash`, computed in `new`.
pub fn staging_hash(&self) -> Hash {
self.staging_hash
}
/// Builds the digest of this layout: current version number, number of
/// stored versions, and the cached tracker and staging hashes.
pub fn digest(&self) -> RpcLayoutDigest {
    let history = self.layout();
    let current_version = history.current().version;
    let active_versions = history.versions.len();
    RpcLayoutDigest {
        current_version,
        active_versions,
        trackers_hash: self.trackers_hash,
        staging_hash: self.staging_hash,
    }
}
// ------------------ helpers for update tracking ---------------
/// Brings the three update trackers of `local_node_id` up to date, in order
/// (ack, sync, sync_ack), then logs the resulting maps at debug level.
/// Each step goes through `update`, so the helper may be rebuilt in between.
pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) {
// Ensure trackers for this node's values are up-to-date
// 1. Acknowledge the last layout version which is not currently
// locked by an in-progress write operation
self.ack_max_free(local_node_id);
// 2. Assume the data on this node is sync'ed up at least to
// the first layout version in the history
self.sync_first(local_node_id);
// 3. Acknowledge everyone has synced up to min(self.sync_map)
self.sync_ack(local_node_id);
debug!("ack_map: {:?}", self.update_trackers.ack_map);
debug!("sync_map: {:?}", self.update_trackers.sync_map);
debug!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map);
}
/// Raises the sync tracker of `local_node_id` to at least the oldest stored
/// layout version.
fn sync_first(&mut self, local_node_id: Uuid) {
    let oldest = self.layout().min_stored();
    self.update(|history| {
        let sync_map = &mut history.update_trackers.sync_map;
        sync_map.set_max(local_node_id, oldest)
    });
}
/// Raises the sync_ack tracker of `local_node_id` to at least the cached
/// `sync_map_min`.
fn sync_ack(&mut self, local_node_id: Uuid) {
    let acked = self.sync_map_min;
    self.update(|history| {
        let sync_ack_map = &mut history.update_trackers.sync_ack_map;
        sync_ack_map.set_max(local_node_id, acked)
    });
}
/// Raises the ack tracker of `local_node_id` to the value given by
/// `max_free_ack`. Returns `true` (and logs) if the tracker changed.
pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool {
    let candidate = self.max_free_ack();
    let updated = self.update(|history| {
        let ack_map = &mut history.update_trackers.ack_map;
        ack_map.set_max(local_node_id, candidate)
    });
    if updated {
        info!("ack_until updated to {}", candidate);
    }
    updated
}
/// Returns the layout version number up to which this node may acknowledge.
///
/// This is the number of the oldest stored version whose ack lock counter is
/// non-zero (a write is in progress for it), or the current version number
/// when no stored version is locked. Versions without an entry in `ack_lock`
/// count as unlocked.
pub(crate) fn max_free_ack(&self) -> u64 {
    self.layout()
        .versions
        .iter()
        .map(|x| x.version)
        // First locked version; `find` replaces the `skip_while(..).next()` pair.
        .find(|v| {
            self.ack_lock
                .get(v)
                .map_or(false, |cnt| cnt.load(Ordering::Relaxed) != 0)
        })
        .unwrap_or_else(|| self.current().version)
}
}

306
src/rpc/layout/history.rs Normal file
View file

@ -0,0 +1,306 @@
use std::collections::HashSet;
use garage_util::crdt::{Crdt, Lww, LwwMap};
use garage_util::data::*;
use garage_util::encode::nonversioned_encode;
use garage_util::error::*;
use super::*;
use crate::replication_mode::*;
impl LayoutHistory {
/// Creates a history holding a single, freshly created layout version, no
/// old versions, default update trackers, and a staging area that has the
/// parameters of that version and no role changes.
pub fn new(replication_factor: ReplicationFactor) -> Self {
    let initial = LayoutVersion::new(replication_factor.into());

    let staging = Lww::raw(
        0,
        LayoutStaging {
            parameters: Lww::<LayoutParameters>::new(initial.parameters),
            roles: LwwMap::new(),
        },
    );

    LayoutHistory {
        versions: vec![initial],
        old_versions: vec![],
        update_trackers: Default::default(),
        staging,
    }
}
// ------------------ who stores what now? ---------------
/// Returns the latest layout version. Panics if `versions` is empty.
pub fn current(&self) -> &LayoutVersion {
    self.versions.last().unwrap()
}
/// Returns the number of the oldest layout version still in `versions`.
/// Panics if `versions` is empty.
pub fn min_stored(&self) -> u64 {
    self.versions.first().unwrap().version
}
/// Returns every node appearing in at least one stored layout version,
/// without duplicates. With several versions the order of the result is the
/// iteration order of a `HashSet`, i.e. unspecified.
pub fn get_all_nodes(&self) -> Vec<Uuid> {
    // A single version: its node list is returned as-is.
    if let [only] = self.versions.as_slice() {
        return only.all_nodes().to_vec();
    }

    let unique: HashSet<&Uuid> = self
        .versions
        .iter()
        .flat_map(|v| v.all_nodes())
        .collect();
    unique.into_iter().copied().collect()
}
/// Returns every non-gateway node appearing in at least one stored layout
/// version, without duplicates. With several versions the order of the
/// result is the iteration order of a `HashSet`, i.e. unspecified.
pub(crate) fn get_all_nongateway_nodes(&self) -> Vec<Uuid> {
    // A single version: its node list is returned as-is.
    if let [only] = self.versions.as_slice() {
        return only.nongateway_nodes().to_vec();
    }

    let unique: HashSet<&Uuid> = self
        .versions
        .iter()
        .flat_map(|v| v.nongateway_nodes())
        .collect();
    unique.into_iter().copied().collect()
}
// ---- housekeeping (all invoked by LayoutHelper) ----
/// Moves every layout version except the latest one from `versions` to the
/// end of `old_versions`, oldest first.
pub(crate) fn keep_current_version_only(&mut self) {
    // One `drain` instead of repeated `remove(0)`, which shifts the whole
    // vector on every call. `saturating_sub` keeps an empty list a no-op.
    let outdated = self.versions.len().saturating_sub(1);
    self.old_versions.extend(self.versions.drain(..outdated));
}
/// Prunes the history in three steps:
/// 1. if the current version passes `check()`, leading versions that fail
///    `check()` are dropped (they are not kept in `old_versions`);
/// 2. versions older than the minimum of the sync_ack tracker among the nodes
///    of the current version are moved to `old_versions`;
/// 3. `old_versions` is trimmed from the front down to `OLD_VERSION_COUNT`.
pub(crate) fn cleanup_old_versions(&mut self) {
// If there are invalid versions before valid versions, remove them
if self.versions.len() > 1 && self.current().check().is_ok() {
while self.versions.len() > 1 && self.versions.first().unwrap().check().is_err() {
let removed = self.versions.remove(0);
info!(
"Layout history: pruning old invalid version {}",
removed.version
);
}
}
// If there are old versions that no one is reading from anymore,
// remove them (keep them in self.old_versions).
// ASSUMPTION: we only care about where nodes in the current layout version
// are reading from, as we assume older nodes are being discarded.
let current_nodes = &self.current().node_id_vec;
let min_version = self.min_stored();
let sync_ack_map_min = self
.update_trackers
.sync_ack_map
.min_among(current_nodes, min_version);
while self.min_stored() < sync_ack_map_min {
// The current version must never be moved out of `versions`.
assert!(self.versions.len() > 1);
let removed = self.versions.remove(0);
info!(
"Layout history: moving version {} to old_versions",
removed.version
);
self.old_versions.push(removed);
}
while self.old_versions.len() > OLD_VERSION_COUNT {
let removed = self.old_versions.remove(0);
info!("Layout history: removing old_version {}", removed.version);
}
}
/// Raises the ack, sync and sync_ack trackers of every node in `nodes` to at
/// least the oldest stored layout version.
pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) {
    let floor = self.min_stored();
    let trackers = &mut self.update_trackers;
    for &node in nodes.iter() {
        trackers.ack_map.set_max(node, floor);
        trackers.sync_map.set_max(node, floor);
        trackers.sync_ack_map.set_max(node, floor);
    }
}
/// Computes the layout version number that `LayoutHelper` caches as
/// `sync_map_min` (see the comments below for the reasoning).
///
/// `all_nongateway_nodes` is the node set over which the global minimum of
/// the sync tracker is taken; `replication_factor` provides the write quorum.
pub(crate) fn calculate_sync_map_min_with_quorum(
&self,
replication_factor: ReplicationFactor,
all_nongateway_nodes: &[Uuid],
) -> u64 {
// This function calculates the minimum layout version from which
// it is safe to read if we want to maintain read-after-write consistency.
// In the general case the computation can be a bit expensive so
// we try to optimize it in several ways.
// If there is only one layout version, we know that's the one
// we need to read from.
if self.versions.len() == 1 {
return self.current().version;
}
let quorum = replication_factor.write_quorum(ConsistencyMode::Consistent);
let min_version = self.min_stored();
let global_min = self
.update_trackers
.sync_map
.min_among(all_nongateway_nodes, min_version);
// If the write quorums are equal to the total number of nodes,
// i.e. no writes can succeed while they are not written to all nodes,
// then we must in all case wait for all nodes to complete a sync.
// This is represented by reading from the layout with version
// number global_min, the smallest layout version for which all nodes
// have completed a sync.
if quorum == self.current().replication_factor {
return global_min;
}
// In the general case, we need to look at all write sets for all partitions,
// and find a safe layout version to read for that partition. We then
// take the minimum value among all partition as the safe layout version
// to read in all cases (the layout version to which all reads are directed).
let mut current_min = self.current().version;
let mut sets_done = HashSet::<Vec<Uuid>>::new();
for (_, p_hash) in self.current().partitions() {
for v in self.versions.iter() {
if v.version == self.current().version {
// We don't care about whether nodes in the latest layout version
// have completed a sync or not, as the sync is push-only
// and by definition nodes in the latest layout version do not
// hold data that must be pushed to nodes in the latest layout
// version, since that's the same version (any data that's
// already in the latest version is assumed to have been written
// by an operation that ensured a quorum of writes within
// that version).
continue;
}
// Determine set of nodes for partition p in layout version v.
// Sort the node set to avoid duplicate computations.
let mut set = v
.nodes_of(&p_hash, v.replication_factor)
.collect::<Vec<Uuid>>();
set.sort();
// If this set was already processed, skip it.
if sets_done.contains(&set) {
continue;
}
// Find the value of the sync update trackers that is the
// highest possible minimum within a quorum of nodes.
let mut sync_values = set
.iter()
.map(|x| self.update_trackers.sync_map.get(x, min_version))
.collect::<Vec<_>>();
sync_values.sort();
// With the values sorted in ascending order, this is the smallest
// of the `quorum` largest values.
// NOTE(review): this indexing assumes the set holds at least
// `quorum` nodes -- confirm `nodes_of` guarantees it.
let set_min = sync_values[sync_values.len() - quorum];
if set_min < current_min {
current_min = set_min;
}
// Unfavorable case: we know we are at the smallest possible version,
// so we can stop early
assert!(current_min >= global_min);
if current_min == global_min {
return current_min;
}
// Add set to already processed sets
sets_done.insert(set);
}
}
current_min
}
/// Hashes (blake2) the non-versioned encoding of the update trackers.
/// Panics if encoding fails.
pub(crate) fn calculate_trackers_hash(&self) -> Hash {
    let encoded = nonversioned_encode(&self.update_trackers).unwrap();
    blake2sum(&encoded[..])
}
/// Hashes (blake2) the non-versioned encoding of the staged layout changes.
/// Panics if encoding fails.
pub(crate) fn calculate_staging_hash(&self) -> Hash {
    let encoded = nonversioned_encode(&self.staging).unwrap();
    blake2sum(&encoded[..])
}
// ================== updates to layout, public interface ===================
pub fn merge(&mut self, other: &LayoutHistory) -> bool {
let mut changed = false;
// Add any new versions to history
for v2 in other.versions.iter() {
if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) {
// Version is already present, check consistency
if v1 != v2 {
error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version);
}
} else if self.versions.iter().all(|v| v.version != v2.version - 1) {
error!(
"Cannot receive new layout version {}, version {} is missing",
v2.version,
v2.version - 1
);
} else {
self.versions.push(v2.clone());
changed = true;
}
}
// Merge trackers
let c = self.update_trackers.merge(&other.update_trackers);
changed = changed || c;
// Merge staged layout changes
if self.staging != other.staging {
let prev_staging = self.staging.clone();
self.staging.merge(&other.staging);
changed = changed || self.staging != prev_staging;
}
changed
}
/// Computes a new layout version from the staged changes and appends it to
/// the history.
///
/// `version` must be `Some(current version + 1)`; `None` or any other value
/// is rejected with an `Error::Message`. On success the staged role changes
/// are cleared (the staged parameters are kept) and the updated history is
/// returned together with the message produced by `calculate_next_version`.
pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> {
match version {
None => {
let error = r#"
Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
"#;
return Err(Error::Message(error.into()));
}
Some(v) => {
if v != self.current().version + 1 {
return Err(Error::Message("Invalid new layout version".into()));
}
}
}
// Compute new version and add it to history
let (new_version, msg) = self
.current()
.clone()
.calculate_next_version(self.staging.get())?;
self.versions.push(new_version);
self.cleanup_old_versions();
// Reset the staged layout changes
self.staging.update(LayoutStaging {
parameters: self.staging.get().parameters.clone(),
roles: LwwMap::new(),
});
Ok((self, msg))
}
/// Replaces the staged changes with an empty staging area: no role changes,
/// and parameters reset to those of the current layout version.
pub fn revert_staged_changes(mut self) -> Result<Self, Error> {
    let reset = LayoutStaging {
        parameters: Lww::new(self.current().parameters),
        roles: LwwMap::new(),
    };
    self.staging.update(reset);
    Ok(self)
}
/// Validates the history. For now this only runs `check()` on the current
/// layout version; older versions are not checked.
pub fn check(&self) -> Result<(), String> {
// TODO: anything more ?
self.current().check()
}
}

381
src/rpc/layout/manager.rs Normal file
View file

@ -0,0 +1,381 @@
use std::collections::HashMap;
use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard};
use std::time::Duration;
use tokio::sync::Notify;
use garage_net::endpoint::Endpoint;
use garage_net::peering::PeeringManager;
use garage_net::NodeID;
use garage_util::config::Config;
use garage_util::data::*;
use garage_util::error::*;
use garage_util::persister::Persister;
use super::*;
use crate::replication_mode::*;
use crate::rpc_helper::*;
use crate::system::*;
/// Owns the cluster layout of the local node: keeps it behind a lock,
/// persists it to disk and exchanges it with other nodes over RPC.
pub struct LayoutManager {
node_id: Uuid,
replication_factor: ReplicationFactor,
// Persister for the `cluster_layout` file of the metadata directory
persist_cluster_layout: Persister<LayoutHistory>,
// Shared with the `RpcHelper` built in `new`
layout: Arc<RwLock<LayoutHelper>>,
// Notified whenever a merge changed the local layout
pub(crate) change_notify: Arc<Notify>,
// Layout version up to which each registered table has synced
table_sync_version: Mutex<HashMap<String, u64>>,
pub(crate) rpc_helper: RpcHelper,
system_endpoint: Arc<Endpoint<SystemRpc, System>>,
}
impl LayoutManager {
/// Creates the layout manager of the local node.
///
/// The layout history is loaded from the `cluster_layout` file of
/// `config.metadata_dir`. If it cannot be loaded, a fresh history is created.
/// The local update trackers are then brought up to date.
///
/// # Errors
///
/// Returns `Error::Message` if the stored layout has a replication factor
/// different from `replication_factor`.
pub fn new(
    config: &Config,
    node_id: NodeID,
    system_endpoint: Arc<Endpoint<SystemRpc, System>>,
    peering: Arc<PeeringManager>,
    replication_factor: ReplicationFactor,
    consistency_mode: ConsistencyMode,
) -> Result<Arc<Self>, Error> {
    let persist_cluster_layout: Persister<LayoutHistory> =
        Persister::new(&config.metadata_dir, "cluster_layout");

    let cluster_layout = match persist_cluster_layout.load() {
        Ok(x) => {
            if x.current().replication_factor != replication_factor.replication_factor() {
                return Err(Error::Message(format!(
                    "Previous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
                    x.current().replication_factor,
                    replication_factor.replication_factor()
                )));
            }
            x
        }
        Err(e) => {
            info!(
                "No valid previous cluster layout stored ({}), starting fresh.",
                e
            );
            LayoutHistory::new(replication_factor)
        }
    };

    // No write can be in progress yet, hence the empty set of ack locks.
    let mut cluster_layout = LayoutHelper::new(
        replication_factor,
        consistency_mode,
        cluster_layout,
        Default::default(),
    );
    cluster_layout.update_trackers(node_id.into());

    let layout = Arc::new(RwLock::new(cluster_layout));
    let change_notify = Arc::new(Notify::new());

    let rpc_helper = RpcHelper::new(
        node_id.into(),
        peering,
        layout.clone(),
        config.rpc_timeout_msec.map(Duration::from_millis),
    );

    Ok(Arc::new(Self {
        node_id: node_id.into(),
        replication_factor,
        persist_cluster_layout,
        layout,
        change_notify,
        table_sync_version: Mutex::new(HashMap::new()),
        system_endpoint,
        rpc_helper,
    }))
}
// ---- PUBLIC INTERFACE ----
/// Acquires the read lock on the layout and returns the guard.
/// Panics if the lock is poisoned.
pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHelper> {
self.layout.read().unwrap()
}
/// Merges `layout` into the local layout exactly as if it had been
/// advertised by another node, discarding the RPC response.
pub async fn update_cluster_layout(
    self: &Arc<Self>,
    layout: &LayoutHistory,
) -> Result<(), Error> {
    self.handle_advertise_cluster_layout(layout)
        .await
        .map(|_| ())
}
/// Registers `table_name` in the sync tracking map, starting at the oldest
/// layout version currently stored.
pub fn add_table(&self, table_name: &'static str) {
    let oldest = self.layout().versions.first().unwrap().version;
    let mut per_table = self.table_sync_version.lock().unwrap();
    per_table.insert(table_name.to_string(), oldest);
}
/// Records that `table_name` has synced up to layout `version`, then raises
/// the sync tracker of the local node to the minimum over all registered
/// tables. If the tracker changed, the trackers are broadcast.
///
/// Panics if `table_name` was not registered with `add_table`.
pub fn sync_table_until(self: &Arc<Self>, table_name: &'static str, version: u64) {
    // The table map lock is released before the layout write lock is taken.
    let sync_until = {
        let mut per_table = self.table_sync_version.lock().unwrap();
        *per_table.get_mut(table_name).unwrap() = version;
        let lowest = per_table.values().copied().min().unwrap();
        lowest
    };

    let mut layout = self.layout.write().unwrap();
    let advanced =
        layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until));
    if advanced {
        info!("sync_until updated to {}", sync_until);
        let trackers = layout.update_trackers.clone();
        self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(trackers));
    }
}
/// Raises the ack tracker of the local node as far as the ack locks allow
/// and, if it changed, broadcasts the update trackers.
fn ack_new_version(self: &Arc<Self>) {
    let mut guard = self.layout.write().unwrap();
    if !guard.ack_max_free(self.node_id) {
        return;
    }
    let trackers = guard.update_trackers.clone();
    self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(trackers));
}
// ---- ACK LOCKING ----
/// Returns the storage node sets of `position` for every stored layout
/// version, wrapped in a `WriteLock` tied to the current layout version.
///
/// The ack lock counter of the current version is incremented here, while
/// the layout read lock is held; it is decremented when the returned
/// `WriteLock` is dropped. Panics if that counter does not exist
/// (`LayoutHelper::new` creates one for the current version).
pub fn write_sets_of(self: &Arc<Self>, position: &Hash) -> WriteLock<Vec<Vec<Uuid>>> {
let layout = self.layout();
let version = layout.current().version;
let nodes = layout.storage_sets_of(position);
layout
.ack_lock
.get(&version)
.unwrap()
.fetch_add(1, Ordering::Relaxed);
WriteLock::new(version, self, nodes)
}
// ---- INTERNALS ---
/// Merges the advertised history `adv` into the local layout, under the
/// write lock.
///
/// The merge is attempted only if `adv` passes `check()` or the local layout
/// already fails it. Returns a clone of the resulting history if anything
/// changed, `None` otherwise.
///
/// Panics if a previously valid layout becomes invalid through the merge, or
/// if a reported change leaves the digest unchanged.
fn merge_layout(&self, adv: &LayoutHistory) -> Option<LayoutHistory> {
let mut layout = self.layout.write().unwrap();
let prev_digest = layout.digest();
let prev_layout_check = layout.check().is_ok();
if !prev_layout_check || adv.check().is_ok() {
if layout.update(|l| l.merge(adv)) {
// Refresh the trackers of the local node for the merged layout
layout.update_trackers(self.node_id);
if prev_layout_check && layout.check().is_err() {
panic!("Merged two correct layouts and got an incorrect layout.");
}
assert!(layout.digest() != prev_digest);
return Some(layout.clone());
}
}
None
}
/// Merges the advertised update trackers `adv` into the local ones, under
/// the write lock. Returns a clone of the resulting trackers if anything
/// changed, `None` otherwise.
///
/// Panics if a reported change leaves the digest unchanged.
fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option<UpdateTrackers> {
    let mut layout = self.layout.write().unwrap();
    let prev_digest = layout.digest();

    // Identical trackers are skipped without going through `update`.
    let merged =
        layout.update_trackers != *adv && layout.update(|l| l.update_trackers.merge(adv));
    if !merged {
        return None;
    }

    layout.update_trackers(self.node_id);
    assert!(layout.digest() != prev_digest);
    Some(layout.update_trackers.clone())
}
/// Asks `peer` for its full cluster layout and merges the answer. Failed
/// calls and unexpected responses are ignored; a merge error is logged.
async fn pull_cluster_layout(self: &Arc<Self>, peer: Uuid) {
    let strategy = RequestStrategy::with_priority(PRIO_HIGH);
    let resp = self
        .rpc_helper
        .call(
            &self.system_endpoint,
            peer,
            SystemRpc::PullClusterLayout,
            strategy,
        )
        .await;

    let layout = match resp {
        Ok(SystemRpc::AdvertiseClusterLayout(layout)) => layout,
        _ => return,
    };
    if let Err(e) = self.handle_advertise_cluster_layout(&layout).await {
        warn!("In pull_cluster_layout: {}", e);
    }
}
/// Asks `peer` for its update trackers and merges the answer. Failed calls
/// and unexpected responses are ignored; a merge error is logged.
async fn pull_cluster_layout_trackers(self: &Arc<Self>, peer: Uuid) {
    let strategy = RequestStrategy::with_priority(PRIO_HIGH);
    let resp = self
        .rpc_helper
        .call(
            &self.system_endpoint,
            peer,
            SystemRpc::PullClusterLayoutTrackers,
            strategy,
        )
        .await;

    let trackers = match resp {
        Ok(SystemRpc::AdvertiseClusterLayoutTrackers(trackers)) => trackers,
        _ => return,
    };
    if let Err(e) = self
        .handle_advertise_cluster_layout_trackers(&trackers)
        .await
    {
        warn!("In pull_cluster_layout_trackers: {}", e);
    }
}
/// Save cluster layout data to disk.
///
/// The layout history is cloned under the read lock; the lock is released
/// before the asynchronous save starts.
///
/// NOTE(review): a persistence failure panics (`expect`) instead of being
/// returned as `Err`, even though the signature returns `Result` -- confirm
/// this is intended.
async fn save_cluster_layout(&self) -> Result<(), Error> {
let layout = self.layout.read().unwrap().clone();
self.persist_cluster_layout
.save_async(&layout)
.await
.expect("Cannot save current cluster layout");
Ok(())
}
/// Broadcasts `rpc` with high priority from a detached background task.
/// A broadcast failure is only logged.
fn broadcast_update(self: &Arc<Self>, rpc: SystemRpc) {
    let this = self.clone();
    tokio::spawn(async move {
        let outcome = this
            .rpc_helper
            .broadcast(
                &this.system_endpoint,
                rpc,
                RequestStrategy::with_priority(PRIO_HIGH),
            )
            .await;
        if let Err(e) = outcome {
            warn!("Error while broadcasting new cluster layout: {}", e);
        }
    });
}
// ---- RPC HANDLERS ----
/// Compares the layout digest advertised by node `from` with the local one
/// and starts a background pull when they differ:
/// - the full layout if the remote has a higher current version, a different
///   number of active versions, or a different staging hash;
/// - otherwise only the update trackers if the tracker hashes differ.
pub(crate) fn handle_advertise_status(self: &Arc<Self>, from: Uuid, remote: &RpcLayoutDigest) {
    let local = self.layout().digest();

    let layout_differs = remote.current_version > local.current_version
        || remote.active_versions != local.active_versions
        || remote.staging_hash != local.staging_hash;

    if layout_differs {
        let this = self.clone();
        tokio::spawn(async move { this.pull_cluster_layout(from).await });
    } else if remote.trackers_hash != local.trackers_hash {
        let this = self.clone();
        tokio::spawn(async move { this.pull_cluster_layout_trackers(from).await });
    }
}
/// Answers a layout pull with a clone of the local layout history.
pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc {
    let snapshot = self.layout.read().unwrap().clone();
    SystemRpc::AdvertiseClusterLayout(snapshot)
}
/// Answers a trackers pull with a clone of the local update trackers.
pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc {
    let guard = self.layout.read().unwrap();
    let trackers = guard.update_trackers.clone();
    SystemRpc::AdvertiseClusterLayoutTrackers(trackers)
}
/// Handles a layout history advertised by another node (also used for local
/// updates through `update_cluster_layout`).
///
/// A layout whose current version has a replication factor different from
/// the local configuration is rejected with an error. Otherwise it is
/// merged; if the merge changed anything, waiters are notified, the merged
/// layout is broadcast and then saved to disk.
pub(crate) async fn handle_advertise_cluster_layout(
self: &Arc<Self>,
adv: &LayoutHistory,
) -> Result<SystemRpc, Error> {
debug!(
"handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}",
adv.versions.len(),
adv.current().version,
adv.update_trackers
);
if adv.current().replication_factor != self.replication_factor.replication_factor() {
let msg = format!(
"Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
adv.current().replication_factor,
self.replication_factor.replication_factor()
);
error!("{}", msg);
return Err(Error::Message(msg));
}
if let Some(new_layout) = self.merge_layout(adv) {
debug!("handle_advertise_cluster_layout: some changes were added to the current stuff");
self.change_notify.notify_waiters();
self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout));
self.save_cluster_layout().await?;
}
Ok(SystemRpc::Ok)
}
/// Handles update trackers advertised by another node: merges them and, if
/// the merge changed anything, notifies waiters, broadcasts the merged
/// trackers and saves the layout to disk.
pub(crate) async fn handle_advertise_cluster_layout_trackers(
    self: &Arc<Self>,
    trackers: &UpdateTrackers,
) -> Result<SystemRpc, Error> {
    debug!("handle_advertise_cluster_layout_trackers: {:?}", trackers);

    let new_trackers = match self.merge_layout_trackers(trackers) {
        Some(merged) => merged,
        None => return Ok(SystemRpc::Ok),
    };

    self.change_notify.notify_waiters();
    self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers));
    self.save_cluster_layout().await?;
    Ok(SystemRpc::Ok)
}
}
// ---- ack lock ----
/// Guard returned by `LayoutManager::write_sets_of`. It carries a value and
/// holds one unit of the ack lock of `layout_version`, released on drop.
pub struct WriteLock<T> {
// Layout version whose ack lock counter was incremented
layout_version: u64,
layout_manager: Arc<LayoutManager>,
value: T,
}
impl<T> WriteLock<T> {
    /// Wraps `value` in a guard for layout `version`. The ack lock counter
    /// itself is not touched here; the caller increments it.
    fn new(version: u64, layout_manager: &Arc<LayoutManager>, value: T) -> Self {
        let layout_manager = Arc::clone(layout_manager);
        Self {
            layout_version: version,
            layout_manager,
            value,
        }
    }
}
/// Shared access to the value wrapped by the lock.
impl<T> AsRef<T> for WriteLock<T> {
fn as_ref(&self) -> &T {
&self.value
}
}
/// Exclusive access to the value wrapped by the lock.
impl<T> AsMut<T> for WriteLock<T> {
fn as_mut(&mut self) -> &mut T {
&mut self.value
}
}
/// Releasing a `WriteLock` decrements the ack-lock counter of its layout
/// version. When the counter was at 1 before the decrement and a newer layout
/// version exists, `ack_new_version` is called on the layout manager.
impl<T> Drop for WriteLock<T> {
fn drop(&mut self) {
let layout = self.layout_manager.layout(); // acquire read lock
if let Some(counter) = layout.ack_lock.get(&self.layout_version) {
// fetch_sub returns the value held *before* the decrement
let prev_lock = counter.fetch_sub(1, Ordering::Relaxed);
// prev_lock == 1 means this was the last outstanding lock for this
// version; only then, and only if the layout has moved on to a newer
// version, do we go on to acknowledge the new version.
if prev_lock == 1 && layout.current().version > self.layout_version {
// The read guard must be dropped before calling ack_new_version.
drop(layout); // release read lock, write lock will be acquired
self.layout_manager.ack_new_version();
}
} else {
// No counter registered for the version this lock was created with.
error!("Could not find ack lock counter for layout version {}. This probably indicates a bug in Garage.", self.layout_version);
}
}
}

478
src/rpc/layout/mod.rs Normal file
View file

@ -0,0 +1,478 @@
use std::fmt;
use bytesize::ByteSize;
use garage_util::crdt::{AutoCrdt, Crdt};
use garage_util::data::Uuid;
mod graph_algo;
mod helper;
mod history;
mod version;
#[cfg(test)]
mod test;
pub mod manager;
// ---- re-exports ----
pub use helper::{LayoutHelper, RpcLayoutDigest, SyncLayoutDigest};
pub use manager::WriteLock;
pub use version::*;
// ---- defines: partitions ----
/// A partition id, which is stored on 16 bits
/// i.e. we have up to 2**16 partitions.
/// (in practice we have exactly 2**PARTITION_BITS partitions)
pub type Partition = u16;
// TODO: make this constant parametrizable in the config file
// For deployments with many nodes it might make sense to bump
// it up to 10.
// Maximum value: 16 (a `Partition` is stored on 16 bits)
/// How many bits from the hash are used to make partitions. Higher numbers mean more fairness in
/// presence of numerous nodes, but exponentially bigger ring. Max 16
pub const PARTITION_BITS: usize = 8;
/// Total number of partitions, i.e. 2**PARTITION_BITS
const NB_PARTITIONS: usize = 1usize << PARTITION_BITS;
// ---- defines: nodes ----
// Type to store compactly the id of a node in the system
// Change this to u16 the day we want to have more than 256 nodes in a cluster
pub type CompactNodeType = u8;
/// Maximum number of nodes; 256 is the number of distinct values
/// that a `CompactNodeType` (u8) can take.
pub const MAX_NODE_NUMBER: usize = 256;
// ======== actual data structures for the layout data ========
// ======== that is persisted to disk ========
// some small utility impls are at the end of this file,
// but most of the code that actually computes stuff is in
// version.rs, history.rs and helper.rs
// Oldest layout format defined in this file: it is the starting point of the
// migration chain (v09::ClusterLayout migrates from v08::ClusterLayout).
// The structs below are persisted, so their field order and types must not change.
mod v08 {
use crate::layout::CompactNodeType;
use garage_util::crdt::LwwMap;
use garage_util::data::{Hash, Uuid};
use serde::{Deserialize, Serialize};
/// The layout of the cluster, i.e. the list of roles
/// which are assigned to each cluster node
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ClusterLayout {
/// The number of this version of the layout
pub version: u64,
/// The number of replicas for each data partition
pub replication_factor: usize,
/// Roles assigned to nodes in this version
pub roles: LwwMap<Uuid, NodeRoleV>,
// see comments in v010::LayoutVersion
pub node_id_vec: Vec<Uuid>,
// values are indices in node_id_vec (see v010::LayoutVersion)
#[serde(with = "serde_bytes")]
pub ring_assignation_data: Vec<CompactNodeType>,
/// Role changes which are staged for the next version of the layout
pub staging: LwwMap<Uuid, NodeRoleV>,
pub staging_hash: Hash,
}
/// An optional role stored as a CRDT value.
// NOTE(review): `None` presumably marks a node whose role was removed
// (kept as a tombstone in the map) -- confirm against the layout code.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct NodeRoleV(pub Option<NodeRole>);
/// The user-assigned roles of cluster nodes
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct NodeRole {
/// Datacenter at which this entry belong. This information is used to
/// perform a better geodistribution
pub zone: String,
/// The capacity of the node
/// If this is set to None, the node does not participate in storing data for the system
/// and is only active as an API gateway to other nodes
pub capacity: Option<u64>,
/// A set of tags to recognize the node
pub tags: Vec<String>,
}
// v08 has no `Previous` format: it is declared as the initial format.
impl garage_util::migrate::InitialFormat for ClusterLayout {}
}
// Second layout format: adds partition_size and layout parameters, renames
// the staging / ring fields, and migrates from v08::ClusterLayout.
// The structs below are persisted, so their field order and types must not change.
mod v09 {
use super::v08;
use crate::layout::CompactNodeType;
use garage_util::crdt::{Lww, LwwMap};
use garage_util::data::{Hash, Uuid};
use serde::{Deserialize, Serialize};
// Role types are unchanged since v08 and are simply re-exported.
pub use v08::{NodeRole, NodeRoleV};
/// The layout of the cluster, i.e. the list of roles
/// which are assigned to each cluster node
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ClusterLayout {
/// The number of this version of the layout
pub version: u64,
/// The number of replicas for each data partition
pub replication_factor: usize,
/// This attribute is only used to retain the previously computed partition size,
/// to know to what extent does it change with the layout update.
pub partition_size: u64,
/// Parameters used to compute the assignment currently given by
/// ring_assignment_data
pub parameters: LayoutParameters,
/// Roles assigned to nodes in this version
pub roles: LwwMap<Uuid, NodeRoleV>,
// see comments in v010::LayoutVersion
pub node_id_vec: Vec<Uuid>,
// values are indices in node_id_vec (see v010::LayoutVersion)
#[serde(with = "serde_bytes")]
pub ring_assignment_data: Vec<CompactNodeType>,
/// Parameters to be used in the next partition assignment computation.
pub staging_parameters: Lww<LayoutParameters>,
/// Role changes which are staged for the next version of the layout
pub staging_roles: LwwMap<Uuid, NodeRoleV>,
pub staging_hash: Hash,
}
/// This struct is used to set the parameters to be used in the assignment computation
/// algorithm. It is stored as a Crdt.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct LayoutParameters {
pub zone_redundancy: ZoneRedundancy,
}
/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
/// of each partition on at least that number of different zones.
/// Otherwise, copies will be stored on the maximum possible number of zones.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub enum ZoneRedundancy {
AtLeast(usize),
Maximum,
}
impl garage_util::migrate::Migrate for ClusterLayout {
const VERSION_MARKER: &'static [u8] = b"G09layout";
type Previous = v08::ClusterLayout;
/// Converts a v08 layout into a v09 layout: capacities are rescaled,
/// a partition size is derived from the existing assignment, and the
/// layout parameters default to maximum zone redundancy.
fn migrate(previous: Self::Previous) -> Self {
use itertools::Itertools;
// In the old layout, capacities are in an arbitrary unit,
// but in the new layout they are in bytes.
// Here we arbitrarily multiply everything by 1G,
// such that 1 old capacity unit = 1GB in the new units.
// This is totally arbitrary and won't work for most users.
let cap_mul = 1024 * 1024 * 1024;
let roles = multiply_all_capacities(previous.roles, cap_mul);
let staging_roles = multiply_all_capacities(previous.staging, cap_mul);
let node_id_vec = previous.node_id_vec;
// Determine partition size
// Sorting makes equal node indices adjacent, so that dedup_with_count
// yields one (count, node index) pair per node present in the ring.
let mut tmp = previous.ring_assignation_data.clone();
tmp.sort();
// For each node: (rescaled) capacity divided by the number of ring
// entries assigned to it; the partition size is the minimum over all
// nodes. Nodes without a role or capacity count as capacity 0, and an
// empty ring gives a partition size of 0.
let partition_size = tmp
.into_iter()
.dedup_with_count()
.map(|(npart, node)| {
roles
.get(&node_id_vec[node as usize])
.and_then(|p| p.0.as_ref().and_then(|r| r.capacity))
.unwrap_or(0) / npart as u64
})
.min()
.unwrap_or(0);
// By default, zone_redundancy is maximum possible value
let parameters = LayoutParameters {
zone_redundancy: ZoneRedundancy::Maximum,
};
Self {
version: previous.version,
replication_factor: previous.replication_factor,
partition_size,
parameters,
roles,
node_id_vec,
ring_assignment_data: previous.ring_assignation_data,
staging_parameters: Lww::new(parameters),
staging_roles,
staging_hash: [0u8; 32].into(), // will be set in the next migration
}
}
}
/// Returns a copy of `old_roles` in which every defined capacity is
/// multiplied by `mul`; entries keep their original timestamps, and roles
/// without a capacity (or removed roles) are copied unchanged.
fn multiply_all_capacities(
old_roles: LwwMap<Uuid, NodeRoleV>,
mul: u64,
) -> LwwMap<Uuid, NodeRoleV> {
let mut new_roles = LwwMap::new();
for (node, ts, role) in old_roles.items() {
let mut role = role.clone();
// Only roles that exist AND have a capacity are rescaled.
if let NodeRoleV(Some(NodeRole {
capacity: Some(ref mut cap),
..
})) = role
{
*cap *= mul;
}
new_roles.merge_raw(node, *ts, &role);
}
new_roles
}
}
// Third layout format: a history of layout versions plus update trackers,
// migrated from a single v09::ClusterLayout.
// The structs below are persisted, so their field order and types must not change.
mod v010 {
use super::v09;
use crate::layout::CompactNodeType;
use garage_util::crdt::{Lww, LwwMap};
use garage_util::data::Uuid;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
// Role and parameter types are unchanged since v09 and are simply re-exported.
pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy};
/// Number of old (non-live) versions to keep, see LayoutHistory::old_versions
pub const OLD_VERSION_COUNT: usize = 5;
/// The history of cluster layouts, with trackers to keep a record
/// of which nodes are up-to-date to current cluster data
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LayoutHistory {
/// The versions currently in use in the cluster
pub versions: Vec<LayoutVersion>,
/// At most 5 of the previous versions, not used by the garage_table
/// module, but useful for the garage_block module to find data blocks
/// that have not yet been moved
pub old_versions: Vec<LayoutVersion>,
/// Update trackers
pub update_trackers: UpdateTrackers,
/// Staged changes for the next version
pub staging: Lww<LayoutStaging>,
}
/// A version of the layout of the cluster, i.e. the list of roles
/// which are assigned to each cluster node
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LayoutVersion {
/// The number of this version
pub version: u64,
/// Roles assigned to nodes in this version
pub roles: LwwMap<Uuid, NodeRoleV>,
/// Parameters used to compute the assignment currently given by
/// ring_assignment_data
pub parameters: LayoutParameters,
/// The number of replicas for each data partition
pub replication_factor: usize,
/// This attribute is only used to retain the previously computed partition size,
/// to know to what extent does it change with the layout update.
pub partition_size: u64,
/// node_id_vec: a vector of node IDs with a role assigned
/// in the system (this includes gateway nodes).
/// The order here is different than the vec stored by `roles`, because:
/// 1. non-gateway nodes are first so that they have lower numbers
/// 2. nodes that don't have a role are excluded (but they need to
/// stay in the CRDT as tombstones)
pub node_id_vec: Vec<Uuid>,
/// number of non-gateway nodes, which are the first ids in node_id_vec
pub nongateway_node_count: usize,
/// The assignation of data partitions to nodes, the values
/// are indices in node_id_vec
#[serde(with = "serde_bytes")]
pub ring_assignment_data: Vec<CompactNodeType>,
}
/// The staged changes for the next layout version
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LayoutStaging {
/// Parameters to be used in the next partition assignment computation.
pub parameters: Lww<LayoutParameters>,
/// Role changes which are staged for the next version of the layout
pub roles: LwwMap<Uuid, NodeRoleV>,
}
/// The tracker of acknowledgments and data syncs around the cluster
#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)]
pub struct UpdateTrackers {
/// The highest layout version number each node has ack'ed
pub ack_map: UpdateTracker,
/// The highest layout version number each node has synced data for
pub sync_map: UpdateTracker,
/// The highest layout version number each node has
/// ack'ed that all other nodes have synced data for
pub sync_ack_map: UpdateTracker,
}
/// Generic update tracker struct
/// (maps a node id to a layout version number)
#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)]
pub struct UpdateTracker(pub BTreeMap<Uuid, u64>);
impl garage_util::migrate::Migrate for LayoutHistory {
const VERSION_MARKER: &'static [u8] = b"G010lh";
type Previous = v09::ClusterLayout;
/// Converts a single v09 layout into a history containing that one
/// version, with no old versions and with all three update trackers
/// set to that version for every non-gateway node.
fn migrate(previous: Self::Previous) -> Self {
// 1 + the highest index in node_id_vec of a node that has a role with a
// capacity (0 if there is none). This equals the number of non-gateway
// nodes as long as they all come first in node_id_vec, as documented
// on LayoutVersion::node_id_vec.
let nongateway_node_count = previous
.node_id_vec
.iter()
.enumerate()
.filter(|(_, uuid)| {
let role = previous.roles.get(uuid);
matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some())
})
.map(|(i, _)| i + 1)
.max()
.unwrap_or(0);
let version = LayoutVersion {
version: previous.version,
replication_factor: previous.replication_factor,
partition_size: previous.partition_size,
parameters: previous.parameters,
roles: previous.roles,
node_id_vec: previous.node_id_vec,
nongateway_node_count,
ring_assignment_data: previous.ring_assignment_data,
};
// Every non-gateway node is recorded at the migrated version number.
let update_tracker = UpdateTracker(
version
.nongateway_nodes()
.iter()
.copied()
.map(|x| (x, version.version))
.collect::<BTreeMap<Uuid, u64>>(),
);
let staging = LayoutStaging {
parameters: previous.staging_parameters,
roles: previous.staging_roles,
};
Self {
versions: vec![version],
old_versions: vec![],
update_trackers: UpdateTrackers {
ack_map: update_tracker.clone(),
sync_map: update_tracker.clone(),
sync_ack_map: update_tracker,
},
// NOTE(review): the layout version number is passed as the first
// argument of Lww::raw, presumably the LWW timestamp -- confirm.
staging: Lww::raw(previous.version, staging),
}
}
}
}
pub use v010::*;
// ---- utility functions ----
// Both types get their CRDT behaviour from `AutoCrdt`.
// NOTE(review): WARN_IF_DIFFERENT = true presumably makes the automatic merge
// log a warning when two differing values are merged -- confirm against
// garage_util::crdt::AutoCrdt.
impl AutoCrdt for LayoutParameters {
const WARN_IF_DIFFERENT: bool = true;
}
impl AutoCrdt for NodeRoleV {
const WARN_IF_DIFFERENT: bool = true;
}
impl Crdt for LayoutStaging {
fn merge(&mut self, other: &LayoutStaging) {
self.parameters.merge(&other.parameters);
self.roles.merge(&other.roles);
}
}
impl NodeRole {
    /// Human-readable capacity of the node: the capacity in bytes formatted
    /// by `ByteSize`, or the literal `"gateway"` when the node has no
    /// capacity.
    pub fn capacity_string(&self) -> String {
        if let Some(bytes) = self.capacity {
            ByteSize::b(bytes).to_string_as(false)
        } else {
            "gateway".to_string()
        }
    }

    /// The node's tags joined with commas (empty string if there are none).
    pub fn tags_string(&self) -> String {
        let separator = ",";
        self.tags.join(separator)
    }
}
impl fmt::Display for ZoneRedundancy {
    /// Writes `maximum` for `Maximum`, and the bare number for `AtLeast(n)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            ZoneRedundancy::AtLeast(count) => write!(f, "{}", count),
            ZoneRedundancy::Maximum => f.write_str("maximum"),
        }
    }
}
impl core::str::FromStr for ZoneRedundancy {
    type Err = &'static str;

    /// Parses a zone redundancy: the keywords `none`, `max` and `maximum`
    /// give `Maximum`; anything else must parse as a `usize` and gives
    /// `AtLeast(n)`, otherwise a static error message is returned.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if matches!(s, "none" | "max" | "maximum") {
            return Ok(ZoneRedundancy::Maximum);
        }
        s.parse::<usize>()
            .map(ZoneRedundancy::AtLeast)
            .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")
    }
}
impl UpdateTracker {
    /// Merges `other` into `self`, keeping for each node the highest of the
    /// two recorded values. Returns `true` if `self` was modified.
    fn merge(&mut self, other: &UpdateTracker) -> bool {
        let mut changed = false;
        for (node, &theirs) in &other.0 {
            match self.0.get_mut(node) {
                // We already know a value at least as high: keep ours.
                Some(ours) if *ours >= theirs => {}
                Some(ours) => {
                    *ours = theirs;
                    changed = true;
                }
                None => {
                    self.0.insert(*node, theirs);
                    changed = true;
                }
            }
        }
        changed
    }

    /// This bumps the update tracker for a given node up to the specified value.
    /// This has potential impacts on the correctness of Garage and should only
    /// be used in very specific circumstances.
    ///
    /// Returns `true` if the stored value was raised or newly inserted.
    pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool {
        if let Some(current) = self.0.get_mut(&peer) {
            if *current < value {
                *current = value;
                true
            } else {
                false
            }
        } else {
            self.0.insert(peer, value);
            true
        }
    }

    /// Lowest tracked value among `storage_nodes`, where nodes absent from the
    /// tracker count as `min_version`. Returns `min_version` for an empty slice.
    pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 {
        let mut lowest: Option<u64> = None;
        for node in storage_nodes {
            let tracked = self.get(node, min_version);
            lowest = Some(match lowest {
                Some(best) if best <= tracked => best,
                _ => tracked,
            });
        }
        lowest.unwrap_or(min_version)
    }

    /// Value tracked for `node`, or `min_version` if the node is not tracked.
    pub fn get(&self, node: &Uuid, min_version: u64) -> u64 {
        match self.0.get(node) {
            Some(&tracked) => tracked,
            None => min_version,
        }
    }
}
impl UpdateTrackers {
    /// Merges the three trackers of `other` into the matching trackers of
    /// `self`. All three merges are always performed; returns `true` if at
    /// least one of them changed something.
    pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool {
        let mut changed = self.ack_map.merge(&other.ack_map);
        changed |= self.sync_map.merge(&other.sync_map);
        changed |= self.sync_ack_map.merge(&other.sync_ack_map);
        changed
    }
}

158
src/rpc/layout/test.rs Normal file
View file

@ -0,0 +1,158 @@
use std::cmp::min;
use std::collections::HashMap;
use garage_util::crdt::Crdt;
use garage_util::error::*;
use crate::layout::*;
use crate::replication_mode::ReplicationFactor;
// This function checks that the computed partition size S is at least as good as the
// one given by a very naive algorithm. To do so, we try to run the naive algorithm
// assuming a partition size of S+1. If we succeed, it means that the supposedly optimal
// assignment was not optimal. The naive algorithm is the following:
// - we compute the max number of partitions associated to every node, capped at the
// partition number. It gives the number of tokens of every node.
// - every zone has a number of tokens equal to the sum of the tokens of its nodes.
// - we cycle over the partitions and associate zone tokens while respecting the
// zone redundancy constraint.
// NOTE: the naive algorithm is not optimal. Counter example:
// take nb_partition = 3 ; replication_factor = 5; redundancy = 4;
// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2)
// With these parameters, the naive algo fails, whereas there is a solution:
// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E)
// Returns Ok(true) when the naive algorithm gets stuck (runs out of zones)
// with a partition size of `cl.partition_size + 1`, i.e. when the naive
// algorithm does not beat the computed layout. Returns Ok(false) when there
// are no zones at all, or when the naive algorithm manages to place every
// replica of every partition.
fn check_against_naive(cl: &LayoutVersion) -> Result<bool, Error> {
// Partition size the naive algorithm tries to achieve
let over_size = cl.partition_size + 1;
let mut zone_token = HashMap::<String, usize>::new();
let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?;
if zones.is_empty() {
return Ok(false);
}
for z in zones.iter() {
zone_token.insert(z.clone(), 0);
}
// Each node contributes as many tokens as it can hold partitions of size
// `over_size`, capped at NB_PARTITIONS; tokens are summed per zone.
for uuid in cl.nongateway_nodes() {
let z = cl.expect_get_node_zone(&uuid);
let c = cl.expect_get_node_capacity(&uuid);
zone_token.insert(
z.to_string(),
zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize),
);
}
// For every partition, we count the number of zone already associated and
// the name of the last zone associated
// Same token counts, indexed by zone id instead of zone name
let mut id_zone_token = vec![0; zones.len()];
for (z, t) in zone_token.iter() {
id_zone_token[zone_to_id[z]] = *t;
}
let mut nb_token = vec![0; NB_PARTITIONS];
// zones.len() is not a valid zone id: it means "no zone associated yet"
let mut last_zone = vec![zones.len(); NB_PARTITIONS];
// Zones are consumed in increasing id order; curr_zone never goes back
let mut curr_zone = 0;
let redundancy = cl.effective_zone_redundancy();
for replic in 0..cl.replication_factor {
for p in 0..NB_PARTITIONS {
// Skip to the next zone while the current one has no token left, or
// while reusing the partition's last zone is rejected by the
// redundancy condition below.
while id_zone_token[curr_zone] == 0
|| (last_zone[p] == curr_zone
&& redundancy - nb_token[p] <= cl.replication_factor - replic)
{
curr_zone += 1;
if curr_zone >= zones.len() {
// No zone left: the naive algorithm fails with size S+1
return Ok(true);
}
}
// Give one token of the current zone to partition p
id_zone_token[curr_zone] -= 1;
if last_zone[p] != curr_zone {
nb_token[p] += 1;
last_zone[p] = curr_zone;
}
}
}
// Every replica was placed: the naive algorithm succeeded with size S+1
return Ok(false);
}
// Prints every line of a layout computation message on stdout.
fn show_msg(msg: &Message) {
    msg.iter().for_each(|line| println!("{}", line));
}
// Stages a new set of roles and parameters in `cl`.
// Node number `i` (the i-th capacity/zone pair) gets the id made of 32 bytes
// all equal to `i as u8`, the given capacity and zone, and no tags. The staged
// zone redundancy is set to AtLeast(zone_redundancy).
fn update_layout(
    cl: &mut LayoutHistory,
    node_capacity_vec: &[u64],
    node_zone_vec: &[&'static str],
    zone_redundancy: usize,
) {
    let staging = cl.staging.get_mut();

    let nodes = node_capacity_vec.iter().zip(node_zone_vec.iter());
    for (index, (&capacity, &zone)) in nodes.enumerate() {
        let role = NodeRole {
            zone: zone.to_string(),
            capacity: Some(capacity),
            tags: Vec::new(),
        };
        let node_id = [index as u8; 32].into();
        let mutator = staging
            .roles
            .update_mutator(node_id, NodeRoleV(Some(role)));
        staging.roles.merge(&mutator);
    }

    let parameters = LayoutParameters {
        zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
    };
    staging.parameters.update(parameters);
}
// Applies four successive layout changes and, after each one, checks that the
// resulting layout passes `check()` and is not beaten by the naive algorithm.
#[test]
fn test_assignment() {
// Step 1: three nodes in three distinct zones, zone redundancy 3
let mut node_capacity_vec = vec![4000, 1000, 2000];
let mut node_zone_vec = vec!["A", "B", "C"];
let mut cl = LayoutHistory::new(ReplicationFactor::new(3).unwrap());
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
let v = cl.current().version;
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()).unwrap());
// Step 2: nine nodes spread over six zones, zone redundancy 2
node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000];
node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"];
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2);
let v = cl.current().version;
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()).unwrap());
// Step 3: same zones, some capacities changed, zone redundancy 3
node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
let v = cl.current().version;
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()).unwrap());
// Step 4: same zones, much larger and very unbalanced capacities, zone redundancy 1
node_capacity_vec = vec![
4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000,
];
update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 1);
let v = cl.current().version;
let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(check_against_naive(cl.current()).unwrap());
}

View file

@ -1,375 +1,55 @@
use std::cmp::Ordering;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt; use std::convert::TryInto;
use bytesize::ByteSize; use bytesize::ByteSize;
use itertools::Itertools; use itertools::Itertools;
use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; use garage_util::crdt::{Crdt, LwwMap};
use garage_util::data::*; use garage_util::data::*;
use garage_util::encode::nonversioned_encode;
use garage_util::error::*; use garage_util::error::*;
use crate::graph_algo::*; use super::graph_algo::*;
use super::*;
use crate::ring::*;
use std::convert::TryInto;
const NB_PARTITIONS: usize = 1usize << PARTITION_BITS;
// The Message type will be used to collect information on the algorithm. // The Message type will be used to collect information on the algorithm.
type Message = Vec<String>; pub type Message = Vec<String>;
mod v08 { impl LayoutVersion {
use crate::ring::CompactNodeType;
use garage_util::crdt::LwwMap;
use garage_util::data::{Hash, Uuid};
use serde::{Deserialize, Serialize};
/// The layout of the cluster, i.e. the list of roles
/// which are assigned to each cluster node
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ClusterLayout {
pub version: u64,
pub replication_factor: usize,
pub roles: LwwMap<Uuid, NodeRoleV>,
/// node_id_vec: a vector of node IDs with a role assigned
/// in the system (this includes gateway nodes).
/// The order here is different than the vec stored by `roles`, because:
/// 1. non-gateway nodes are first so that they have lower numbers
/// 2. nodes that don't have a role are excluded (but they need to
/// stay in the CRDT as tombstones)
pub node_id_vec: Vec<Uuid>,
/// the assignation of data partitions to node, the values
/// are indices in node_id_vec
#[serde(with = "serde_bytes")]
pub ring_assignation_data: Vec<CompactNodeType>,
/// Role changes which are staged for the next version of the layout
pub staging: LwwMap<Uuid, NodeRoleV>,
pub staging_hash: Hash,
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct NodeRoleV(pub Option<NodeRole>);
/// The user-assigned roles of cluster nodes
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct NodeRole {
/// Datacenter at which this entry belong. This information is used to
/// perform a better geodistribution
pub zone: String,
/// The capacity of the node
/// If this is set to None, the node does not participate in storing data for the system
/// and is only active as an API gateway to other nodes
pub capacity: Option<u64>,
/// A set of tags to recognize the node
pub tags: Vec<String>,
}
impl garage_util::migrate::InitialFormat for ClusterLayout {}
}
mod v09 {
use super::v08;
use crate::ring::CompactNodeType;
use garage_util::crdt::{Lww, LwwMap};
use garage_util::data::{Hash, Uuid};
use serde::{Deserialize, Serialize};
pub use v08::{NodeRole, NodeRoleV};
/// The layout of the cluster, i.e. the list of roles
/// which are assigned to each cluster node
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ClusterLayout {
pub version: u64,
pub replication_factor: usize,
/// This attribute is only used to retain the previously computed partition size,
/// to know to what extent does it change with the layout update.
pub partition_size: u64,
/// Parameters used to compute the assignment currently given by
/// ring_assignment_data
pub parameters: LayoutParameters,
pub roles: LwwMap<Uuid, NodeRoleV>,
/// see comment in v08::ClusterLayout
pub node_id_vec: Vec<Uuid>,
/// see comment in v08::ClusterLayout
#[serde(with = "serde_bytes")]
pub ring_assignment_data: Vec<CompactNodeType>,
/// Parameters to be used in the next partition assignment computation.
pub staging_parameters: Lww<LayoutParameters>,
/// Role changes which are staged for the next version of the layout
pub staging_roles: LwwMap<Uuid, NodeRoleV>,
pub staging_hash: Hash,
}
/// This struct is used to set the parameters to be used in the assignment computation
/// algorithm. It is stored as a Crdt.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct LayoutParameters {
pub zone_redundancy: ZoneRedundancy,
}
/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
/// of each partition on at least that number of different zones.
/// Otherwise, copies will be stored on the maximum possible number of zones.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
pub enum ZoneRedundancy {
AtLeast(usize),
Maximum,
}
impl garage_util::migrate::Migrate for ClusterLayout {
const VERSION_MARKER: &'static [u8] = b"G09layout";
type Previous = v08::ClusterLayout;
fn migrate(previous: Self::Previous) -> Self {
use itertools::Itertools;
// In the old layout, capacities are in an arbitrary unit,
// but in the new layout they are in bytes.
// Here we arbitrarily multiply everything by 1G,
// such that 1 old capacity unit = 1GB in the new units.
// This is totally arbitrary and won't work for most users.
let cap_mul = 1024 * 1024 * 1024;
let roles = multiply_all_capacities(previous.roles, cap_mul);
let staging_roles = multiply_all_capacities(previous.staging, cap_mul);
let node_id_vec = previous.node_id_vec;
// Determine partition size
let mut tmp = previous.ring_assignation_data.clone();
tmp.sort();
let partition_size = tmp
.into_iter()
.dedup_with_count()
.map(|(npart, node)| {
roles
.get(&node_id_vec[node as usize])
.and_then(|p| p.0.as_ref().and_then(|r| r.capacity))
.unwrap_or(0) / npart as u64
})
.min()
.unwrap_or(0);
// By default, zone_redundancy is maximum possible value
let parameters = LayoutParameters {
zone_redundancy: ZoneRedundancy::Maximum,
};
let mut res = Self {
version: previous.version,
replication_factor: previous.replication_factor,
partition_size,
parameters,
roles,
node_id_vec,
ring_assignment_data: previous.ring_assignation_data,
staging_parameters: Lww::new(parameters),
staging_roles,
staging_hash: [0u8; 32].into(),
};
res.staging_hash = res.calculate_staging_hash();
res
}
}
fn multiply_all_capacities(
old_roles: LwwMap<Uuid, NodeRoleV>,
mul: u64,
) -> LwwMap<Uuid, NodeRoleV> {
let mut new_roles = LwwMap::new();
for (node, ts, role) in old_roles.items() {
let mut role = role.clone();
if let NodeRoleV(Some(NodeRole {
capacity: Some(ref mut cap),
..
})) = role
{
*cap *= mul;
}
new_roles.merge_raw(node, *ts, &role);
}
new_roles
}
}
pub use v09::*;
impl AutoCrdt for LayoutParameters {
const WARN_IF_DIFFERENT: bool = true;
}
impl AutoCrdt for NodeRoleV {
const WARN_IF_DIFFERENT: bool = true;
}
impl NodeRole {
pub fn capacity_string(&self) -> String {
match self.capacity {
Some(c) => ByteSize::b(c).to_string_as(false),
None => "gateway".to_string(),
}
}
pub fn tags_string(&self) -> String {
self.tags.join(",")
}
}
impl fmt::Display for ZoneRedundancy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ZoneRedundancy::Maximum => write!(f, "maximum"),
ZoneRedundancy::AtLeast(x) => write!(f, "{}", x),
}
}
}
impl core::str::FromStr for ZoneRedundancy {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum),
x => {
let v = x
.parse::<usize>()
.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?;
Ok(ZoneRedundancy::AtLeast(v))
}
}
}
}
// Implementation of the ClusterLayout methods unrelated to the assignment algorithm.
impl ClusterLayout {
pub fn new(replication_factor: usize) -> Self { pub fn new(replication_factor: usize) -> Self {
// We set the default zone redundancy to be Maximum, meaning that the maximum // We set the default zone redundancy to be Maximum, meaning that the maximum
// possible value will be used depending on the cluster topology // possible value will be used depending on the cluster topology
let parameters = LayoutParameters { let parameters = LayoutParameters {
zone_redundancy: ZoneRedundancy::Maximum, zone_redundancy: ZoneRedundancy::Maximum,
}; };
let staging_parameters = Lww::<LayoutParameters>::new(parameters);
let empty_lwwmap = LwwMap::new(); LayoutVersion {
let mut ret = ClusterLayout {
version: 0, version: 0,
replication_factor, replication_factor,
partition_size: 0, partition_size: 0,
roles: LwwMap::new(), roles: LwwMap::new(),
node_id_vec: Vec::new(), node_id_vec: Vec::new(),
nongateway_node_count: 0,
ring_assignment_data: Vec::new(), ring_assignment_data: Vec::new(),
parameters, parameters,
staging_parameters,
staging_roles: empty_lwwmap,
staging_hash: [0u8; 32].into(),
};
ret.staging_hash = ret.calculate_staging_hash();
ret
}
fn calculate_staging_hash(&self) -> Hash {
let hashed_tuple = (&self.staging_roles, &self.staging_parameters);
blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..])
}
pub fn merge(&mut self, other: &ClusterLayout) -> bool {
match other.version.cmp(&self.version) {
Ordering::Greater => {
*self = other.clone();
true
}
Ordering::Equal => {
self.staging_parameters.merge(&other.staging_parameters);
self.staging_roles.merge(&other.staging_roles);
let new_staging_hash = self.calculate_staging_hash();
let changed = new_staging_hash != self.staging_hash;
self.staging_hash = new_staging_hash;
changed
}
Ordering::Less => false,
} }
} }
pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> { // ===================== accessors ======================
match version {
None => {
let error = r#"
Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
"#;
return Err(Error::Message(error.into()));
}
Some(v) => {
if v != self.version + 1 {
return Err(Error::Message("Invalid new layout version".into()));
}
}
}
self.roles.merge(&self.staging_roles); /// Returns a list of IDs of nodes that have a role in this
self.roles.retain(|(_, _, v)| v.0.is_some()); /// version of the cluster layout, including gateway nodes
self.parameters = *self.staging_parameters.get(); pub fn all_nodes(&self) -> &[Uuid] {
self.staging_roles.clear();
self.staging_hash = self.calculate_staging_hash();
let msg = self.calculate_partition_assignment()?;
self.version += 1;
Ok((self, msg))
}
pub fn revert_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> {
match version {
None => {
let error = r#"
Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
"#;
return Err(Error::Message(error.into()));
}
Some(v) => {
if v != self.version + 1 {
return Err(Error::Message("Invalid new layout version".into()));
}
}
}
self.staging_roles.clear();
self.staging_parameters.update(self.parameters);
self.staging_hash = self.calculate_staging_hash();
self.version += 1;
Ok(self)
}
/// Returns a list of IDs of nodes that currently have
/// a role in the cluster
pub fn node_ids(&self) -> &[Uuid] {
&self.node_id_vec[..] &self.node_id_vec[..]
} }
pub fn num_nodes(&self) -> usize { /// Returns a list of IDs of nodes that have a storage capacity
self.node_id_vec.len() /// assigned in this version of the cluster layout
pub fn nongateway_nodes(&self) -> &[Uuid] {
&self.node_id_vec[..self.nongateway_node_count]
} }
/// Returns the role of a node in the layout /// Returns the role of a node in the layout, if it has one
pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> {
match self.roles.get(node) { match self.roles.get(node) {
Some(NodeRoleV(Some(v))) => Some(v), Some(NodeRoleV(Some(v))) => Some(v),
@ -377,41 +57,23 @@ To know the correct value of the new layout version, invoke `garage layout show`
} }
} }
/// Returns the uuids of the non_gateway nodes in self.node_id_vec. /// Returns the capacity of a node in the layout, if it has one
fn nongateway_nodes(&self) -> Vec<Uuid> { pub fn get_node_capacity(&self, uuid: &Uuid) -> Option<u64> {
let mut result = Vec::<Uuid>::new();
for uuid in self.node_id_vec.iter() {
match self.node_role(uuid) {
Some(role) if role.capacity.is_some() => result.push(*uuid),
_ => (),
}
}
result
}
/// Given a node uuids, this function returns the label of its zone
fn get_node_zone(&self, uuid: &Uuid) -> Result<String, Error> {
match self.node_role(uuid) {
Some(role) => Ok(role.zone.clone()),
_ => Err(Error::Message(
"The Uuid does not correspond to a node present in the cluster.".into(),
)),
}
}
/// Given a node uuids, this function returns its capacity or fails if it does not have any
pub fn get_node_capacity(&self, uuid: &Uuid) -> Result<u64, Error> {
match self.node_role(uuid) { match self.node_role(uuid) {
Some(NodeRole { Some(NodeRole {
capacity: Some(cap), capacity: Some(cap),
zone: _, zone: _,
tags: _, tags: _,
}) => Ok(*cap), }) => Some(*cap),
_ => Err(Error::Message( _ => None,
"The Uuid does not correspond to a node present in the \ }
cluster or this node does not have a positive capacity." }
.into(),
)), /// Given a node uuids, this function returns the label of its zone if it has one
pub fn get_node_zone(&self, uuid: &Uuid) -> Option<&str> {
match self.node_role(uuid) {
Some(role) => Some(&role.zone),
_ => None,
} }
} }
@ -435,17 +97,65 @@ To know the correct value of the new layout version, invoke `garage layout show`
)) ))
} }
/// Returns the partition into which data at the given hash position falls
pub fn partition_of(&self, position: &Hash) -> Partition {
    // The partition number is made of the PARTITION_BITS most significant bits
    // of the hash, read from its first two bytes in big-endian order.
    let prefix: [u8; 2] = position.as_slice()[0..2].try_into().unwrap();
    u16::from_be_bytes(prefix) >> (16 - PARTITION_BITS)
}
/// Iterates over all partitions, yielding for each of them its number and the
/// first hash of a partition key that would fall in it
pub fn partitions(&self) -> impl Iterator<Item = (Partition, Hash)> + '_ {
    (0..(1 << PARTITION_BITS)).map(|idx| {
        let partition = idx as u16;
        // The partition number occupies the high bits of the first two bytes;
        // every other byte of the first hash of the partition is zero.
        let prefix = u16::to_be_bytes(partition << (16 - PARTITION_BITS));
        let mut first_hash = [0u8; 32];
        first_hash[..2].copy_from_slice(&prefix[..]);
        (partition, Hash::from(first_hash))
    })
}
/// Returns the `n` nodes on which data for this hash should be replicated.
///
/// `n` must be equal to the replication factor of this layout version.
/// If the assignment data does not have the expected length, a warning is
/// logged and no node is returned.
pub fn nodes_of(&self, position: &Hash, n: usize) -> impl Iterator<Item = Uuid> + '_ {
    assert_eq!(n, self.replication_factor);

    let assignment = &self.ring_assignment_data;
    let ring_ready = assignment.len() == self.replication_factor * (1 << PARTITION_BITS);

    let partition_nodes = if ring_ready {
        // The nodes of a partition are stored contiguously, replication_factor at a time
        let first = self.partition_of(position) as usize * self.replication_factor;
        let last = first + self.replication_factor;
        &assignment[first..last]
    } else {
        warn!("Ring not yet ready, read/writes will be lost!");
        &[]
    };

    // Entries of the assignment data are indices into node_id_vec
    partition_nodes
        .iter()
        .map(move |idx| self.node_id_vec[*idx as usize])
}
// ===================== internal information extractors ======================
/// Returns the capacity of the given node.
///
/// # Panics
///
/// Panics if `get_node_capacity` returns `None` for this node.
pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 {
    let capacity = self.get_node_capacity(uuid);
    capacity.expect("non-gateway node with zero capacity")
}
/// Returns the zone of the given node.
///
/// # Panics
///
/// Panics if `get_node_zone` returns `None` for this node.
pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str {
    let zone = self.get_node_zone(uuid);
    zone.expect("node without a zone")
}
/// Returns the sum of capacities of non gateway nodes in the cluster /// Returns the sum of capacities of non gateway nodes in the cluster
fn get_total_capacity(&self) -> Result<u64, Error> { fn get_total_capacity(&self) -> u64 {
let mut total_capacity = 0; let mut total_capacity = 0;
for uuid in self.nongateway_nodes().iter() { for uuid in self.nongateway_nodes() {
total_capacity += self.get_node_capacity(uuid)?; total_capacity += self.expect_get_node_capacity(uuid);
} }
Ok(total_capacity) total_capacity
} }
/// Returns the effective value of the zone_redundancy parameter /// Returns the effective value of the zone_redundancy parameter
fn effective_zone_redundancy(&self) -> usize { pub(crate) fn effective_zone_redundancy(&self) -> usize {
match self.parameters.zone_redundancy { match self.parameters.zone_redundancy {
ZoneRedundancy::AtLeast(v) => v, ZoneRedundancy::AtLeast(v) => v,
ZoneRedundancy::Maximum => { ZoneRedundancy::Maximum => {
@ -465,10 +175,14 @@ To know the correct value of the new layout version, invoke `garage layout show`
/// (assignment, roles, parameters, partition size) /// (assignment, roles, parameters, partition size)
/// returns true if consistent, false if error /// returns true if consistent, false if error
pub fn check(&self) -> Result<(), String> { pub fn check(&self) -> Result<(), String> {
// Check that the hash of the staging data is correct // Check that the assignment data has the correct length
let staging_hash = self.calculate_staging_hash(); let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor;
if staging_hash != self.staging_hash { if self.ring_assignment_data.len() != expected_assignment_data_len {
return Err("staging_hash is incorrect".into()); return Err(format!(
"ring_assignment_data has incorrect length {} instead of {}",
self.ring_assignment_data.len(),
expected_assignment_data_len
));
} }
// Check that node_id_vec contains the correct list of nodes // Check that node_id_vec contains the correct list of nodes
@ -486,16 +200,6 @@ To know the correct value of the new layout version, invoke `garage layout show`
return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes));
} }
// Check that the assignment data has the correct length
let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor;
if self.ring_assignment_data.len() != expected_assignment_data_len {
return Err(format!(
"ring_assignment_data has incorrect length {} instead of {}",
self.ring_assignment_data.len(),
expected_assignment_data_len
));
}
// Check that the assigned nodes are correct identifiers // Check that the assigned nodes are correct identifiers
// of nodes that are assigned a role // of nodes that are assigned a role
// and that role is not the role of a gateway nodes // and that role is not the role of a gateway nodes
@ -524,10 +228,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
// Check that every partition is spread over at least zone_redundancy zones. // Check that every partition is spread over at least zone_redundancy zones.
let zones_of_p = nodes_of_p let zones_of_p = nodes_of_p
.iter() .iter()
.map(|n| { .map(|n| self.expect_get_node_zone(&self.node_id_vec[*n as usize]))
self.get_node_zone(&self.node_id_vec[*n as usize])
.expect("Zone not found.")
})
.collect::<Vec<_>>(); .collect::<Vec<_>>();
if zones_of_p.iter().unique().count() < zone_redundancy { if zones_of_p.iter().unique().count() < zone_redundancy {
return Err(format!( return Err(format!(
@ -546,7 +247,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
if *usage > 0 { if *usage > 0 {
let uuid = self.node_id_vec[n]; let uuid = self.node_id_vec[n];
let partusage = usage * self.partition_size; let partusage = usage * self.partition_size;
let nodecap = self.get_node_capacity(&uuid).unwrap(); let nodecap = self.expect_get_node_capacity(&uuid);
if partusage > nodecap { if partusage > nodecap {
return Err(format!( return Err(format!(
"node usage ({}) is bigger than node capacity ({})", "node usage ({}) is bigger than node capacity ({})",
@ -574,12 +275,24 @@ To know the correct value of the new layout version, invoke `garage layout show`
Ok(()) Ok(())
} }
}
// ==================================================================================== // ================== updates to layout, internals ===================
/// Consumes this layout version and computes the next one from the staged changes.
///
/// The version number is incremented, the staged roles are merged in, the staged
/// parameters are adopted, and a new partition assignment is calculated.
/// Returns the new layout version together with the message produced by the
/// assignment calculation, or the error of that calculation.
pub(crate) fn calculate_next_version(
    mut self,
    staging: &LayoutStaging,
) -> Result<(Self, Message), Error> {
    self.version += 1;

    // Merge the staged roles, then keep only the entries that still hold a role
    self.roles.merge(&staging.roles);
    self.roles.retain(|(_, _, role)| role.0.is_some());

    self.parameters = *staging.parameters.get();

    let report = self.calculate_partition_assignment()?;
    Ok((self, report))
}
// Implementation of the ClusterLayout methods related to the assignment algorithm.
impl ClusterLayout {
/// This function calculates a new partition-to-node assignment. /// This function calculates a new partition-to-node assignment.
/// The computed assignment respects the node replication factor /// The computed assignment respects the node replication factor
/// and the zone redundancy parameter It maximizes the capacity of a /// and the zone redundancy parameter It maximizes the capacity of a
@ -609,12 +322,12 @@ impl ClusterLayout {
// to use them as indices in the flow graphs. // to use them as indices in the flow graphs.
let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?;
let nb_nongateway_nodes = self.nongateway_nodes().len(); if self.nongateway_nodes().len() < self.replication_factor {
if nb_nongateway_nodes < self.replication_factor {
return Err(Error::Message(format!( return Err(Error::Message(format!(
"The number of nodes with positive \ "The number of nodes with positive \
capacity ({}) is smaller than the replication factor ({}).", capacity ({}) is smaller than the replication factor ({}).",
nb_nongateway_nodes, self.replication_factor self.nongateway_nodes().len(),
self.replication_factor
))); )));
} }
if id_to_zone.len() < zone_redundancy { if id_to_zone.len() < zone_redundancy {
@ -712,12 +425,14 @@ impl ClusterLayout {
.map(|(k, _, _)| *k) .map(|(k, _, _)| *k)
.collect(); .collect();
let mut new_node_id_vec = Vec::<Uuid>::new(); let old_node_id_vec = std::mem::take(&mut self.node_id_vec);
new_node_id_vec.extend(new_non_gateway_nodes);
new_node_id_vec.extend(new_gateway_nodes);
let old_node_id_vec = self.node_id_vec.clone(); self.nongateway_node_count = new_non_gateway_nodes.len();
self.node_id_vec = new_node_id_vec.clone(); self.node_id_vec.clear();
self.node_id_vec.extend(new_non_gateway_nodes);
self.node_id_vec.extend(new_gateway_nodes);
let new_node_id_vec = &self.node_id_vec;
// (2) We retrieve the old association // (2) We retrieve the old association
// We rewrite the old association with the new indices. We only consider partition // We rewrite the old association with the new indices. We only consider partition
@ -756,7 +471,7 @@ impl ClusterLayout {
} }
} }
// We write the ring // We clear the ring assignemnt data
self.ring_assignment_data = Vec::<CompactNodeType>::new(); self.ring_assignment_data = Vec::<CompactNodeType>::new();
Ok(Some(old_assignment)) Ok(Some(old_assignment))
@ -764,7 +479,9 @@ impl ClusterLayout {
/// This function generates ids for the zone of the nodes appearing in /// This function generates ids for the zone of the nodes appearing in
/// self.node_id_vec. /// self.node_id_vec.
fn generate_nongateway_zone_ids(&self) -> Result<(Vec<String>, HashMap<String, usize>), Error> { pub(crate) fn generate_nongateway_zone_ids(
&self,
) -> Result<(Vec<String>, HashMap<String, usize>), Error> {
let mut id_to_zone = Vec::<String>::new(); let mut id_to_zone = Vec::<String>::new();
let mut zone_to_id = HashMap::<String, usize>::new(); let mut zone_to_id = HashMap::<String, usize>::new();
@ -797,7 +514,7 @@ impl ClusterLayout {
} }
let mut s_down = 1; let mut s_down = 1;
let mut s_up = self.get_total_capacity()?; let mut s_up = self.get_total_capacity();
while s_down + 1 < s_up { while s_down + 1 < s_up {
g = self.generate_flow_graph( g = self.generate_flow_graph(
(s_down + s_up) / 2, (s_down + s_up) / 2,
@ -846,7 +563,7 @@ impl ClusterLayout {
zone_redundancy: usize, zone_redundancy: usize,
) -> Result<Graph<FlowEdge>, Error> { ) -> Result<Graph<FlowEdge>, Error> {
let vertices = let vertices =
ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
let mut g = Graph::<FlowEdge>::new(&vertices); let mut g = Graph::<FlowEdge>::new(&vertices);
let nb_zones = zone_to_id.len(); let nb_zones = zone_to_id.len();
for p in 0..NB_PARTITIONS { for p in 0..NB_PARTITIONS {
@ -866,8 +583,8 @@ impl ClusterLayout {
} }
} }
for n in 0..self.nongateway_nodes().len() { for n in 0..self.nongateway_nodes().len() {
let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]);
let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[n])];
g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?;
for p in 0..NB_PARTITIONS { for p in 0..NB_PARTITIONS {
if !exclude_assoc.contains(&(p, n)) { if !exclude_assoc.contains(&(p, n)) {
@ -913,7 +630,7 @@ impl ClusterLayout {
// The algorithm is such that it will start with the flow that we just computed // The algorithm is such that it will start with the flow that we just computed
// and find ameliorating paths from that. // and find ameliorating paths from that.
for (p, n) in exclude_edge.iter() { for (p, n) in exclude_edge.iter() {
let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])];
g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?;
} }
g.compute_maximal_flow()?; g.compute_maximal_flow()?;
@ -933,7 +650,7 @@ impl ClusterLayout {
let mut cost = CostFunction::new(); let mut cost = CostFunction::new();
for (p, assoc_p) in prev_assign.iter().enumerate() { for (p, assoc_p) in prev_assign.iter().enumerate() {
for n in assoc_p.iter() { for n in assoc_p.iter() {
let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])];
cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1);
} }
} }
@ -988,7 +705,7 @@ impl ClusterLayout {
let mut msg = Message::new(); let mut msg = Message::new();
let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
let total_cap = self.get_total_capacity()?; let total_cap = self.get_total_capacity();
let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
msg.push(format!( msg.push(format!(
"Usable capacity / total cluster capacity: {} / {} ({:.1} %)", "Usable capacity / total cluster capacity: {} / {} ({:.1} %)",
@ -1035,7 +752,7 @@ impl ClusterLayout {
let mut old_zones_of_p = Vec::<usize>::new(); let mut old_zones_of_p = Vec::<usize>::new();
for n in prev_assign[p].iter() { for n in prev_assign[p].iter() {
old_zones_of_p old_zones_of_p
.push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); .push(zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]);
} }
if !old_zones_of_p.contains(&z) { if !old_zones_of_p.contains(&z) {
new_partitions_zone[z] += 1; new_partitions_zone[z] += 1;
@ -1077,7 +794,7 @@ impl ClusterLayout {
for z in 0..id_to_zone.len() { for z in 0..id_to_zone.len() {
let mut nodes_of_z = Vec::<usize>::new(); let mut nodes_of_z = Vec::<usize>::new();
for n in 0..storing_nodes.len() { for n in 0..storing_nodes.len() {
if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { if self.expect_get_node_zone(&self.node_id_vec[n]) == id_to_zone[z] {
nodes_of_z.push(n); nodes_of_z.push(n);
} }
} }
@ -1091,13 +808,13 @@ impl ClusterLayout {
let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
let mut total_cap_z = 0; let mut total_cap_z = 0;
for n in nodes_of_z.iter() { for n in nodes_of_z.iter() {
total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]);
} }
let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);
for n in nodes_of_z.iter() { for n in nodes_of_z.iter() {
let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]);
let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string(); let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string();
table.push(format!( table.push(format!(
" {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)",
@ -1127,167 +844,3 @@ impl ClusterLayout {
Ok(msg) Ok(msg)
} }
} }
// ====================================================================================
#[cfg(test)]
mod tests {
use super::{Error, *};
use std::cmp::min;
// This function checks that the partition size S computed is at least better than the
// one given by a very naive algorithm. To do so, we try to run the naive algorithm
// assuming a partition size of S+1. If we succeed, it means that the optimal assignment
// was not optimal. The naive algorithm is the following:
// - we compute the max number of partitions associated to every node, capped at the
// partition number. It gives the number of tokens of every node.
// - every zone has a number of tokens equal to the sum of the tokens of its nodes.
// - we cycle over the partitions and associate zone tokens while respecting the
// zone redundancy constraint.
// NOTE: the naive algorithm is not optimal. Counter example:
// take nb_partition = 3 ; replication_factor = 5; redundancy = 4;
// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2)
// With these parameters, the naive algo fails, whereas there is a solution:
// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E)
//
// Returns Ok(true) when the naive algorithm runs out of zones before completing an
// assignment with partition size S+1, and Ok(false) when it completes one or when
// the layout has no zone.
fn check_against_naive(cl: &ClusterLayout) -> Result<bool, Error> {
    let over_size = cl.partition_size + 1;

    let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?;
    if zones.is_empty() {
        return Ok(false);
    }

    // Number of tokens of every zone: sum over its nodes of the number of partitions
    // of size S+1 that the node can hold, capped at the number of partitions.
    let mut zone_token: HashMap<String, usize> =
        zones.iter().map(|z| (z.clone(), 0)).collect();
    for uuid in cl.nongateway_nodes().iter() {
        let z = cl.get_node_zone(uuid)?;
        let c = cl.get_node_capacity(uuid)?;
        *zone_token.entry(z).or_insert(0) += min(NB_PARTITIONS, (c / over_size) as usize);
    }

    // Same token counts, indexed by zone id instead of zone name
    let mut id_zone_token = vec![0; zones.len()];
    for (z, t) in zone_token.iter() {
        id_zone_token[zone_to_id[z]] = *t;
    }

    // For every partition, we count the number of zones already associated and
    // the id of the last zone associated (zones.len() meaning "none yet")
    let mut nb_token = vec![0; NB_PARTITIONS];
    let mut last_zone = vec![zones.len(); NB_PARTITIONS];

    let mut curr_zone = 0;

    let redundancy = cl.effective_zone_redundancy();

    for replic in 0..cl.replication_factor {
        for p in 0..NB_PARTITIONS {
            // Skip zones that have no token left, or that were just used for this
            // partition. The second test is
            //   redundancy - nb_token[p] <= replication_factor - replic
            // written with additions only, so that it cannot underflow when
            // nb_token[p] is larger than redundancy.
            while id_zone_token[curr_zone] == 0
                || (last_zone[p] == curr_zone
                    && redundancy + replic <= cl.replication_factor + nb_token[p])
            {
                curr_zone += 1;
                if curr_zone >= zones.len() {
                    return Ok(true);
                }
            }
            id_zone_token[curr_zone] -= 1;
            if last_zone[p] != curr_zone {
                nb_token[p] += 1;
                last_zone[p] = curr_zone;
            }
        }
    }

    Ok(false)
}
/// Prints every line of the given message on standard output.
fn show_msg(msg: &Message) {
    msg.iter().for_each(|line| println!("{}", line));
}
// Test helper: stages one role per entry of the input vectors, stages the requested
// zone redundancy, then recomputes the staging hash of the layout.
// Only the length of `node_id_vec` is used: the id of node `i` is made of 32 bytes
// all equal to `i`, not of the values stored in `node_id_vec`.
// `node_capacity_vec` and `node_zone_vec` are indexed by the same `i` and must be at
// least as long as `node_id_vec`.
fn update_layout(
cl: &mut ClusterLayout,
node_id_vec: &Vec<u8>,
node_capacity_vec: &Vec<u64>,
node_zone_vec: &Vec<String>,
zone_redundancy: usize,
) {
for i in 0..node_id_vec.len() {
// Append the id of node `i` to the node list of the layout
if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) {
cl.node_id_vec.push(x);
}
// Stage a role for the id found at position `i` of the node list, with the
// zone and capacity given for node `i` and no tags
let update = cl.staging_roles.update_mutator(
cl.node_id_vec[i],
NodeRoleV(Some(NodeRole {
zone: (node_zone_vec[i].to_string()),
capacity: (Some(node_capacity_vec[i])),
tags: (vec![]),
})),
);
cl.staging_roles.merge(&update);
}
// Stage the zone redundancy parameter
cl.staging_parameters.update(LayoutParameters {
zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
});
// The staging hash must reflect the staged changes made above
cl.staging_hash = cl.calculate_staging_hash();
}
// Applies four successive layout updates to a cluster with replication factor 3.
// After each update, the resulting layout must pass `check()` and
// `check_against_naive` must return Ok(true).
#[test]
fn test_assignment() {
// Step 1: three nodes in three distinct zones, zone redundancy 3
let mut node_id_vec = vec![1, 2, 3];
let mut node_capacity_vec = vec![4000, 1000, 2000];
let mut node_zone_vec = vec!["A", "B", "C"]
.into_iter()
.map(|x| x.to_string())
.collect();
let mut cl = ClusterLayout::new(3);
update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3);
let v = cl.version;
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(matches!(check_against_naive(&cl), Ok(true)));
// Step 2: nine nodes spread over six zones, zone redundancy 2
node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000];
node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"]
.into_iter()
.map(|x| x.to_string())
.collect();
update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2);
let v = cl.version;
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(matches!(check_against_naive(&cl), Ok(true)));
// Step 3: same nodes and zones, two capacities changed, zone redundancy 3
node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3);
let v = cl.version;
let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(matches!(check_against_naive(&cl), Ok(true)));
// Step 4: same nodes and zones, much larger capacities except for one node,
// zone redundancy 1
node_capacity_vec = vec![
4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000,
];
update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1);
let v = cl.version;
let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
show_msg(&msg);
assert_eq!(cl.check(), Ok(()));
assert!(matches!(check_against_naive(&cl), Ok(true)));
}
}

View file

@ -11,10 +11,8 @@ mod consul;
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
mod kubernetes; mod kubernetes;
pub mod graph_algo;
pub mod layout; pub mod layout;
pub mod replication_mode; pub mod replication_mode;
pub mod ring;
pub mod system; pub mod system;
pub mod rpc_helper; pub mod rpc_helper;

View file

@ -1,57 +1,94 @@
#[derive(Clone, Copy)] use garage_util::config::Config;
pub enum ReplicationMode { use garage_util::crdt::AutoCrdt;
None, use garage_util::error::*;
TwoWay, use serde::{Deserialize, Serialize};
TwoWayDangerous,
ThreeWay, #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
ThreeWayDegraded, #[serde(transparent)]
ThreeWayDangerous, pub struct ReplicationFactor(usize);
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ConsistencyMode {
/// Read- and Write-quorum are 1
Dangerous,
/// Read-quorum is 1
Degraded,
/// Read- and Write-quorum are determined for read-after-write-consistency
#[default]
Consistent,
} }
impl ReplicationMode { impl ConsistencyMode {
pub fn parse(v: &str) -> Option<Self> { pub fn parse(s: &str) -> Option<Self> {
match v { serde_json::from_value(serde_json::Value::String(s.to_string())).ok()
"none" | "1" => Some(Self::None),
"2" => Some(Self::TwoWay),
"2-dangerous" => Some(Self::TwoWayDangerous),
"3" => Some(Self::ThreeWay),
"3-degraded" => Some(Self::ThreeWayDegraded),
"3-dangerous" => Some(Self::ThreeWayDangerous),
_ => None,
}
} }
}
pub fn control_write_max_faults(&self) -> usize { impl AutoCrdt for ConsistencyMode {
match self { const WARN_IF_DIFFERENT: bool = true;
Self::None => 0, }
_ => 1,
impl ReplicationFactor {
pub fn new(replication_factor: usize) -> Option<Self> {
if replication_factor < 1 {
None
} else {
Some(Self(replication_factor))
} }
} }
pub fn replication_factor(&self) -> usize { pub fn replication_factor(&self) -> usize {
match self { self.0
Self::None => 1, }
Self::TwoWay | Self::TwoWayDangerous => 2,
Self::ThreeWay | Self::ThreeWayDegraded | Self::ThreeWayDangerous => 3, pub fn read_quorum(&self, consistency_mode: ConsistencyMode) -> usize {
match consistency_mode {
ConsistencyMode::Dangerous | ConsistencyMode::Degraded => 1,
ConsistencyMode::Consistent => self.replication_factor().div_ceil(2),
} }
} }
pub fn read_quorum(&self) -> usize { pub fn write_quorum(&self, consistency_mode: ConsistencyMode) -> usize {
match self { match consistency_mode {
Self::None => 1, ConsistencyMode::Dangerous => 1,
Self::TwoWay | Self::TwoWayDangerous => 1, ConsistencyMode::Degraded | ConsistencyMode::Consistent => {
Self::ThreeWay => 2, (self.replication_factor() + 1) - self.read_quorum(ConsistencyMode::Consistent)
Self::ThreeWayDegraded | Self::ThreeWayDangerous => 1, }
}
}
pub fn write_quorum(&self) -> usize {
match self {
Self::None => 1,
Self::TwoWay => 2,
Self::TwoWayDangerous => 1,
Self::ThreeWay | Self::ThreeWayDegraded => 2,
Self::ThreeWayDangerous => 1,
} }
} }
} }
impl std::convert::From<ReplicationFactor> for usize {
fn from(replication_factor: ReplicationFactor) -> usize {
replication_factor.0
}
}
/// Determines the replication factor and the consistency mode from the configuration.
///
/// Two mutually exclusive ways of configuring replication are accepted:
/// - the legacy `replication_mode` option (`"none"`, `"1"`, `"2"`, `"2-dangerous"`,
///   `"3"`, `"3-degraded"` or `"3-dangerous"`), accepted only when
///   `replication_factor` is not set and `consistency_mode` is `"consistent"`;
///   a warning is logged when it is used;
/// - the `replication_factor` option, combined with `consistency_mode`.
///
/// # Errors
///
/// Returns an error when the value of `replication_mode`, `replication_factor` or
/// `consistency_mode` is invalid, and for any other combination of these options
/// than the two described above.
pub fn parse_replication_mode(
    config: &Config,
) -> Result<(ReplicationFactor, ConsistencyMode), Error> {
    match (
        &config.replication_mode,
        config.replication_factor,
        config.consistency_mode.as_str(),
    ) {
        (Some(replication_mode), None, "consistent") => {
            tracing::warn!("Legacy config option replication_mode in use. Please migrate to replication_factor and consistency_mode");
            let parsed_replication_mode = match replication_mode.as_str() {
                "1" | "none" => Some((ReplicationFactor(1), ConsistencyMode::Consistent)),
                "2" => Some((ReplicationFactor(2), ConsistencyMode::Consistent)),
                "2-dangerous" => Some((ReplicationFactor(2), ConsistencyMode::Dangerous)),
                "3" => Some((ReplicationFactor(3), ConsistencyMode::Consistent)),
                "3-degraded" => Some((ReplicationFactor(3), ConsistencyMode::Degraded)),
                "3-dangerous" => Some((ReplicationFactor(3), ConsistencyMode::Dangerous)),
                _ => None,
            };
            Some(parsed_replication_mode.ok_or_message("Invalid replication_mode in config file.")?)
        }
        (None, Some(replication_factor), consistency_mode) => {
            let replication_factor = ReplicationFactor::new(replication_factor)
                .ok_or_message("Invalid replication_factor in config file.")?;
            let consistency_mode = ConsistencyMode::parse(consistency_mode)
                .ok_or_message("Invalid consistency_mode in config file.")?;
            Some((replication_factor, consistency_mode))
        }
        _ => None,
    }
    // The option is named replication_factor, as in the warning above
    .ok_or_message("Either the legacy replication_mode or replication_factor and consistency_mode can be set, not both.")
}

View file

@ -1,164 +0,0 @@
//! Module containing types related to computing nodes which should receive a copy of data blocks
//! and metadata
use std::convert::TryInto;
use garage_util::data::*;
use crate::layout::ClusterLayout;
/// A partition id, which is stored on 16 bits
/// i.e. we have up to 2**16 partitions.
/// (in practice we have exactly 2**PARTITION_BITS partitions)
pub type Partition = u16;
// TODO: make this constant parametrizable in the config file.
// For deployments with many nodes it might make sense to bump
// it up to 10.
// Maximum value: 16
/// How many bits from the hash are used to make partitions. Higher numbers mean more fairness
/// in the presence of numerous nodes, but an exponentially bigger ring. Max 16
pub const PARTITION_BITS: usize = 8;
/// Mask that keeps only the PARTITION_BITS most significant bits of a 16-bit hash prefix
const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS);
/// A ring distributing objects fairly to nodes
#[derive(Clone)]
pub struct Ring {
/// The replication factor for this ring
pub replication_factor: usize,
/// The network configuration used to generate this ring
pub layout: ClusterLayout,
/// Internal order of nodes used to make a more compact representation of the ring:
/// ring entries refer to nodes by their position in this list
nodes: Vec<Uuid>,
/// The list of entries in the ring, one per partition; empty if the ring could not be built
ring: Vec<RingEntry>,
}
/// Type used to store compactly the id of a node in the system.
/// Change this to u16 the day we want to have more than 256 nodes in a cluster
pub type CompactNodeType = u8;
/// NOTE(review): presumably the maximum number of nodes in a cluster, i.e. the number
/// of distinct values of `CompactNodeType` (confirm against the users of this constant)
pub const MAX_NODE_NUMBER: usize = 256;
// The maximum number of times an object might get replicated.
// This must be at least 3 because Garage supports 3-way replication.
// Here we use 6 so that the size of a ring entry is 8 bytes
// (2 bytes partition id, 6 bytes node numbers as u8s)
const MAX_REPLICATION: usize = 6;
/// An entry in the ring, describing one partition
#[derive(Clone, Debug)]
struct RingEntry {
/// The first two bytes of the first hash that goes in this partition
/// (the next bytes are zeroes)
hash_prefix: u16,
/// The nodes that store this partition, stored as a list of positions in the `nodes`
/// field of the Ring structure.
/// Only items 0 up to ring.replication_factor - 1 are used, others are zeros
nodes_buf: [CompactNodeType; MAX_REPLICATION],
}
impl Ring {
pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self {
if replication_factor != layout.replication_factor {
warn!("Could not build ring: replication factor does not match between local configuration and network role assignment.");
return Self::empty(layout, replication_factor);
}
if layout.ring_assignment_data.len() != replication_factor * (1 << PARTITION_BITS) {
warn!("Could not build ring: network role assignment data has invalid length");
return Self::empty(layout, replication_factor);
}
let nodes = layout.node_id_vec.clone();
let ring = (0..(1 << PARTITION_BITS))
.map(|i| {
let top = (i as u16) << (16 - PARTITION_BITS);
let mut nodes_buf = [0u8; MAX_REPLICATION];
nodes_buf[..replication_factor].copy_from_slice(
&layout.ring_assignment_data
[replication_factor * i..replication_factor * (i + 1)],
);
RingEntry {
hash_prefix: top,
nodes_buf,
}
})
.collect::<Vec<_>>();
Self {
replication_factor,
layout,
nodes,
ring,
}
}
fn empty(layout: ClusterLayout, replication_factor: usize) -> Self {
Self {
replication_factor,
layout,
nodes: vec![],
ring: vec![],
}
}
/// Get the partition in which data would fall on
pub fn partition_of(&self, position: &Hash) -> Partition {
let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap());
top >> (16 - PARTITION_BITS)
}
/// Get the list of partitions and the first hash of a partition key that would fall in it
pub fn partitions(&self) -> Vec<(Partition, Hash)> {
let mut ret = vec![];
for (i, entry) in self.ring.iter().enumerate() {
let mut location = [0u8; 32];
location[..2].copy_from_slice(&u16::to_be_bytes(entry.hash_prefix)[..]);
ret.push((i as u16, location.into()));
}
if !ret.is_empty() {
assert_eq!(ret[0].1, [0u8; 32].into());
}
ret
}
/// Walk the ring to find the n servers in which data should be replicated
pub fn get_nodes(&self, position: &Hash, n: usize) -> Vec<Uuid> {
if self.ring.len() != 1 << PARTITION_BITS {
warn!("Ring not yet ready, read/writes will be lost!");
return vec![];
}
let partition_idx = self.partition_of(position) as usize;
let partition = &self.ring[partition_idx];
let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap());
// Check that we haven't messed up our partition table, i.e. that this partition
// table entrey indeed corresponds to the item we are storing
assert_eq!(
partition.hash_prefix & PARTITION_MASK_U16,
top & PARTITION_MASK_U16
);
assert!(n <= self.replication_factor);
partition.nodes_buf[..n]
.iter()
.map(|i| self.nodes[*i as usize])
.collect::<Vec<_>>()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A ring entry is expected to occupy exactly 8 bytes
    /// (2 bytes of hash prefix and 6 bytes of node numbers).
    #[test]
    fn test_ring_entry_size() {
        let entry_size = std::mem::size_of::<RingEntry>();
        assert_eq!(entry_size, 8);
    }
}

View file

@ -1,12 +1,12 @@
//! Contain structs related to making RPCs //! Contain structs related to making RPCs
use std::sync::Arc; use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::time::Duration; use std::time::Duration;
use futures::future::join_all; use futures::future::join_all;
use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt; use futures::stream::StreamExt;
use tokio::select; use tokio::select;
use tokio::sync::watch;
use opentelemetry::KeyValue; use opentelemetry::KeyValue;
use opentelemetry::{ use opentelemetry::{
@ -26,8 +26,8 @@ use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::metrics::RecordDuration; use garage_util::metrics::RecordDuration;
use crate::layout::{LayoutHelper, LayoutHistory};
use crate::metrics::RpcMetrics; use crate::metrics::RpcMetrics;
use crate::ring::Ring;
// Default RPC timeout = 5 minutes // Default RPC timeout = 5 minutes
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
@ -36,11 +36,11 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct RequestStrategy { pub struct RequestStrategy {
/// Min number of response to consider the request successful /// Min number of response to consider the request successful
pub rs_quorum: Option<usize>, rs_quorum: Option<usize>,
/// Should requests be dropped after enough response are received /// Send all requests at once
pub rs_interrupt_after_quorum: bool, rs_send_all_at_once: Option<bool>,
/// Request priority /// Request priority
pub rs_priority: RequestPriority, rs_priority: RequestPriority,
/// Custom timeout for this request /// Custom timeout for this request
rs_timeout: Timeout, rs_timeout: Timeout,
} }
@ -57,7 +57,7 @@ impl RequestStrategy {
pub fn with_priority(prio: RequestPriority) -> Self { pub fn with_priority(prio: RequestPriority) -> Self {
RequestStrategy { RequestStrategy {
rs_quorum: None, rs_quorum: None,
rs_interrupt_after_quorum: false, rs_send_all_at_once: None,
rs_priority: prio, rs_priority: prio,
rs_timeout: Timeout::Default, rs_timeout: Timeout::Default,
} }
@ -67,10 +67,9 @@ impl RequestStrategy {
self.rs_quorum = Some(quorum); self.rs_quorum = Some(quorum);
self self
} }
/// Set if requests can be dropped after quorum has been reached /// Set quorum to be reached for request
/// In general true for read requests, and false for write pub fn send_all_at_once(mut self, value: bool) -> Self {
pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self { self.rs_send_all_at_once = Some(value);
self.rs_interrupt_after_quorum = interrupt;
self self
} }
/// Deactivate timeout for this request /// Deactivate timeout for this request
@ -91,7 +90,7 @@ pub struct RpcHelper(Arc<RpcHelperInner>);
struct RpcHelperInner { struct RpcHelperInner {
our_node_id: Uuid, our_node_id: Uuid,
peering: Arc<PeeringManager>, peering: Arc<PeeringManager>,
ring: watch::Receiver<Arc<Ring>>, layout: Arc<RwLock<LayoutHelper>>,
metrics: RpcMetrics, metrics: RpcMetrics,
rpc_timeout: Duration, rpc_timeout: Duration,
} }
@ -100,7 +99,7 @@ impl RpcHelper {
pub(crate) fn new( pub(crate) fn new(
our_node_id: Uuid, our_node_id: Uuid,
peering: Arc<PeeringManager>, peering: Arc<PeeringManager>,
ring: watch::Receiver<Arc<Ring>>, layout: Arc<RwLock<LayoutHelper>>,
rpc_timeout: Option<Duration>, rpc_timeout: Option<Duration>,
) -> Self { ) -> Self {
let metrics = RpcMetrics::new(); let metrics = RpcMetrics::new();
@ -108,7 +107,7 @@ impl RpcHelper {
Self(Arc::new(RpcHelperInner { Self(Arc::new(RpcHelperInner {
our_node_id, our_node_id,
peering, peering,
ring, layout,
metrics, metrics,
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
})) }))
@ -130,6 +129,12 @@ impl RpcHelper {
N: IntoReq<M> + Send, N: IntoReq<M> + Send,
H: StreamingEndpointHandler<M>, H: StreamingEndpointHandler<M>,
{ {
let tracer = opentelemetry::global::tracer("garage");
let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to);
let mut span = tracer.start(span_name);
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
let metric_tags = [ let metric_tags = [
KeyValue::new("rpc_endpoint", endpoint.path().to_string()), KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
KeyValue::new("from", format!("{:?}", self.0.our_node_id)), KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
@ -141,6 +146,7 @@ impl RpcHelper {
let node_id = to.into(); let node_id = to.into();
let rpc_call = endpoint let rpc_call = endpoint
.call_streaming(&node_id, msg, strat.rs_priority) .call_streaming(&node_id, msg, strat.rs_priority)
.with_context(Context::current_with_span(span))
.record_duration(&self.0.metrics.rpc_duration, &metric_tags); .record_duration(&self.0.metrics.rpc_duration, &metric_tags);
let timeout = async { let timeout = async {
@ -183,12 +189,17 @@ impl RpcHelper {
N: IntoReq<M>, N: IntoReq<M>,
H: StreamingEndpointHandler<M>, H: StreamingEndpointHandler<M>,
{ {
let tracer = opentelemetry::global::tracer("garage");
let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len());
let span = tracer.start(span_name);
let msg = msg.into_req().map_err(garage_net::error::Error::from)?; let msg = msg.into_req().map_err(garage_net::error::Error::from)?;
let resps = join_all( let resps = join_all(
to.iter() to.iter()
.map(|to| self.call(endpoint, *to, msg.clone(), strat)), .map(|to| self.call(endpoint, *to, msg.clone(), strat)),
) )
.with_context(Context::current_with_span(span))
.await; .await;
Ok(to Ok(to
.iter() .iter()
@ -220,6 +231,22 @@ impl RpcHelper {
/// Make a RPC call to multiple servers, returning either a Vec of responses, /// Make a RPC call to multiple servers, returning either a Vec of responses,
/// or an error if quorum could not be reached due to too many errors /// or an error if quorum could not be reached due to too many errors
///
/// If RequestStrategy has send_all_at_once set, then all requests will be
/// sent at once, and `try_call_many` will return as soon as a quorum of
/// responses is achieved, dropping and cancelling the remaining requests.
///
/// Otherwise, `quorum` requests will be sent at the same time, and if an
/// error response is received, a new request will be sent to replace it.
/// The ordering of nodes to which requests are sent is determined by
/// the `RpcHelper::request_order` function, which takes into account
/// parameters such as node zones and measured ping values.
///
/// In both cases, the basic contract of this function is that even in the
/// absence of failures, the RPC call might not be driven to completion
/// on all of the specified nodes. It is therefore unfit for broadcast
/// write operations where we expect all nodes to successfully store
/// the written data.
pub async fn try_call_many<M, N, H, S>( pub async fn try_call_many<M, N, H, S>(
&self, &self,
endpoint: &Arc<Endpoint<M, H>>, endpoint: &Arc<Endpoint<M, H>>,
@ -236,31 +263,24 @@ impl RpcHelper {
let quorum = strategy.rs_quorum.unwrap_or(to.len()); let quorum = strategy.rs_quorum.unwrap_or(to.len());
let tracer = opentelemetry::global::tracer("garage"); let tracer = opentelemetry::global::tracer("garage");
let span_name = if strategy.rs_interrupt_after_quorum { let span_name = format!(
format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len()) "RPC [{}] try_call_many (quorum {}/{})",
} else { endpoint.path(),
format!( quorum,
"RPC {} to {} (quorum {})", to.len()
endpoint.path(), );
to.len(),
quorum
)
};
let mut span = tracer.start(span_name); let mut span = tracer.start(span_name);
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
span.set_attribute(KeyValue::new("to", format!("{:?}", to))); span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
span.set_attribute(KeyValue::new("quorum", quorum as i64)); span.set_attribute(KeyValue::new("quorum", quorum as i64));
span.set_attribute(KeyValue::new(
"interrupt_after_quorum",
strategy.rs_interrupt_after_quorum.to_string(),
));
self.try_call_many_internal(endpoint, to, msg, strategy, quorum) self.try_call_many_inner(endpoint, to, msg, strategy, quorum)
.with_context(Context::current_with_span(span)) .with_context(Context::current_with_span(span))
.await .await
} }
async fn try_call_many_internal<M, N, H, S>( async fn try_call_many_inner<M, N, H, S>(
&self, &self,
endpoint: &Arc<Endpoint<M, H>>, endpoint: &Arc<Endpoint<M, H>>,
to: &[Uuid], to: &[Uuid],
@ -274,110 +294,63 @@ impl RpcHelper {
H: StreamingEndpointHandler<M> + 'static, H: StreamingEndpointHandler<M> + 'static,
S: Send + 'static, S: Send + 'static,
{ {
let msg = msg.into_req().map_err(garage_net::error::Error::from)?; // Once quorum is reached, other requests don't matter.
// What we do here is only send the required number of requests
// to reach a quorum, prioritizing nodes with the lowest latency.
// When there are errors, we start new requests to compensate.
// TODO: this could be made more aggressive, e.g. if after 2x the
// average ping of a given request, the response is not yet received,
// preemptively send an additional request to any remaining nodes.
// Reorder requests to priorize closeness / low latency
let request_order = self.request_order(&self.0.layout.read().unwrap(), to.iter().copied());
let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false);
// Build future for each request // Build future for each request
// They are not started now: they are added below in a FuturesUnordered // They are not started now: they are added below in a FuturesUnordered
// object that will take care of polling them (see below) // object that will take care of polling them (see below)
let requests = to.iter().cloned().map(|to| { let msg = msg.into_req().map_err(garage_net::error::Error::from)?;
let mut requests = request_order.into_iter().map(|to| {
let self2 = self.clone(); let self2 = self.clone();
let msg = msg.clone(); let msg = msg.clone();
let endpoint2 = endpoint.clone(); let endpoint2 = endpoint.clone();
(to, async move { async move { self2.call(&endpoint2, to, msg, strategy).await }
self2.call(&endpoint2, to, msg, strategy).await
})
}); });
// Vectors in which success results and errors will be collected // Vectors in which success results and errors will be collected
let mut successes = vec![]; let mut successes = vec![];
let mut errors = vec![]; let mut errors = vec![];
if strategy.rs_interrupt_after_quorum { // resp_stream will contain all of the requests that are currently in flight.
// Case 1: once quorum is reached, other requests don't matter. // (for the moment none, they will be added in the loop below)
// What we do here is only send the required number of requests let mut resp_stream = FuturesUnordered::new();
// to reach a quorum, priorizing nodes with the lowest latency.
// When there are errors, we start new requests to compensate.
// Reorder requests to priorize closeness / low latency // Do some requests and collect results
let request_order = self.request_order(to); while successes.len() < quorum {
let mut ord_requests = vec![(); request_order.len()] // If the current set of requests that are running is not enough to possibly
.into_iter() // reach quorum, start some new requests.
.map(|_| None) while send_all_at_once || successes.len() + resp_stream.len() < quorum {
.collect::<Vec<_>>(); if let Some(fut) = requests.next() {
for (to, fut) in requests { resp_stream.push(fut)
let i = request_order.iter().position(|x| *x == to).unwrap(); } else {
ord_requests[i] = Some((to, fut)); break;
}
// Make an iterator to take requests in their sorted order
let mut requests = ord_requests.into_iter().map(Option::unwrap);
// resp_stream will contain all of the requests that are currently in flight.
// (for the moment none, they will be added in the loop below)
let mut resp_stream = FuturesUnordered::new();
// Do some requests and collect results
'request_loop: while successes.len() < quorum {
// If the current set of requests that are running is not enough to possibly
// reach quorum, start some new requests.
while successes.len() + resp_stream.len() < quorum {
if let Some((req_to, fut)) = requests.next() {
let tracer = opentelemetry::global::tracer("garage");
let span = tracer.start(format!("RPC to {:?}", req_to));
resp_stream.push(tokio::spawn(
fut.with_context(Context::current_with_span(span)),
));
} else {
// If we have no request to add, we know that we won't ever
// reach quorum: bail out now.
break 'request_loop;
}
}
assert!(!resp_stream.is_empty()); // because of loop invariants
// Wait for one request to terminate
match resp_stream.next().await.unwrap().unwrap() {
Ok(msg) => {
successes.push(msg);
}
Err(e) => {
errors.push(e);
}
}
}
} else {
// Case 2: all of the requests need to be sent in all cases,
// and need to terminate. (this is the case for writes that
// must be spread to n nodes)
// Just start all the requests in parallel and return as soon
// as the quorum is reached.
let mut resp_stream = requests
.map(|(_, fut)| fut)
.collect::<FuturesUnordered<_>>();
while let Some(resp) = resp_stream.next().await {
match resp {
Ok(msg) => {
successes.push(msg);
if successes.len() >= quorum {
break;
}
}
Err(e) => {
errors.push(e);
}
} }
} }
if !resp_stream.is_empty() { if successes.len() + resp_stream.len() < quorum {
// Continue remaining requests in background. // We know we won't ever reach quorum
// Note: these requests can get interrupted on process shutdown, break;
// we must not count on them being executed for certain. }
// For all background things that have to happen with certainty,
// they have to be put in a proper queue that is persisted to disk. // Wait for one request to terminate
tokio::spawn(async move { match resp_stream.next().await.unwrap() {
resp_stream.collect::<Vec<Result<_, _>>>().await; Ok(msg) => {
}); successes.push(msg);
}
Err(e) => {
errors.push(e);
}
} }
} }
@ -385,18 +358,174 @@ impl RpcHelper {
Ok(successes) Ok(successes)
} else { } else {
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
Err(Error::Quorum(quorum, successes.len(), to.len(), errors)) Err(Error::Quorum(
quorum,
None,
successes.len(),
to.len(),
errors,
))
} }
} }
pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> { /// Make a RPC call to multiple servers, returning either a Vec of responses,
/// or an error if quorum could not be reached due to too many errors
///
/// Contrary to try_call_many, this function is especially made for broadcast
/// write operations. In particular:
///
/// - The request are sent to all specified nodes as soon as `try_write_many_sets`
/// is invoked.
///
/// - When `try_write_many_sets` returns, all remaining requests that haven't
/// completed move to a background task so that they have a chance to
/// complete successfully if there are no failures.
///
/// In addition, the nodes to which requests should be sent are divided in
/// "quorum sets", and `try_write_many_sets` only returns once a quorum
/// has been validated in each set. This is used in the case of cluster layout
/// changes, where data has to be written both in the old layout and in the
/// new one as long as all nodes have not successfully transitioned and
/// moved all data to the new layout.
pub async fn try_write_many_sets<M, N, H, S>(
    &self,
    endpoint: &Arc<Endpoint<M, H>>,
    to_sets: &[Vec<Uuid>],
    msg: N,
    strategy: RequestStrategy,
) -> Result<Vec<S>, Error>
where
    M: Rpc<Response = Result<S, Error>> + 'static,
    N: IntoReq<M>,
    H: StreamingEndpointHandler<M> + 'static,
    S: Send + 'static,
{
    // A write must always carry an explicit quorum; its absence is a
    // programming error on the caller's side, hence the panic.
    let quorum = strategy
        .rs_quorum
        .expect("internal error: missing quorum value in try_write_many_sets");

    // Open a tracing span describing this broadcast write and attach the
    // sender, the target sets and the quorum to it.
    let tracer = opentelemetry::global::tracer("garage");
    let mut span = tracer.start(format!(
        "RPC [{}] try_write_many_sets (quorum {} in {} sets)",
        endpoint.path(),
        quorum,
        to_sets.len()
    ));
    let from_attr = KeyValue::new("from", format!("{:?}", self.0.our_node_id));
    span.set_attribute(from_attr);
    let to_attr = KeyValue::new("to", format!("{:?}", to_sets));
    span.set_attribute(to_attr);
    let quorum_attr = KeyValue::new("quorum", quorum as i64);
    span.set_attribute(quorum_attr);

    // The actual work happens in the inner function, run within the span.
    let span_context = Context::current_with_span(span);
    self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum)
        .with_context(span_context)
        .await
}
async fn try_write_many_sets_inner<M, N, H, S>(
&self,
endpoint: &Arc<Endpoint<M, H>>,
to_sets: &[Vec<Uuid>],
msg: N,
strategy: RequestStrategy,
quorum: usize,
) -> Result<Vec<S>, Error>
where
M: Rpc<Response = Result<S, Error>> + 'static,
N: IntoReq<M>,
H: StreamingEndpointHandler<M> + 'static,
S: Send + 'static,
{
// Peers may appear in many quorum sets. Here, build a list of peers,
// mapping to the index of the quorum sets in which they appear.
let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum);
// Send one request to each peer of the quorum sets
let msg = msg.into_req().map_err(garage_net::error::Error::from)?;
let requests = result_tracker.nodes.keys().map(|peer| {
let self2 = self.clone();
let msg = msg.clone();
let endpoint2 = endpoint.clone();
let to = *peer;
async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }
});
let mut resp_stream = requests.collect::<FuturesUnordered<_>>();
// Drive requests to completion
while let Some((node, resp)) = resp_stream.next().await {
// Store the response in the correct vector and increment the
// appropriate counters
result_tracker.register_result(node, resp);
// If we have a quorum of ok in all quorum sets, then it's a success!
if result_tracker.all_quorums_ok() {
// Continue all other requets in background
tokio::spawn(async move {
resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await;
});
return Ok(result_tracker.success_values());
}
// If there is a quorum set for which too many errors were received,
// we know it's impossible to get a quorum, so return immediately.
if result_tracker.too_many_failures() {
break;
}
}
// At this point, there is no quorum and we know that a quorum
// will never be achieved. Currently, we drop all remaining requests.
// Should we still move them to background so that they can continue
// for non-failed nodes? Not doing so has no impact on correctness,
// but it means that more cancellation messages will be sent. Idk.
// (When an in-progress request future is dropped, Netapp automatically
// sends a cancellation message to the remote node to inform it that
// the result is no longer needed. In turn, if the remote node receives
// the cancellation message in time, it interrupts the task of the
// running request handler.)
// Failure, could not get quorum
Err(result_tracker.quorum_error())
}
// ---- functions not related to MAKING RPCs, but just determining to what nodes
// they should be made and in which order ----
/// Builds the deduplicated list of nodes to try when reading the block at
/// `position`.
///
/// Layout versions are visited in reverse order of `versions`, followed by
/// `old_versions` in reverse order; versions whose number is greater than
/// `sync_map_min()` are skipped. Within each version, nodes are taken in
/// the order returned by `rpc_helper.request_order`, and a node already
/// present in the result is not added again.
pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec<Uuid> {
    let layout = self.0.layout.read().unwrap();

    let candidate_versions = layout
        .versions
        .iter()
        .rev()
        .chain(layout.old_versions.iter().rev())
        .filter(|ver| ver.version <= layout.sync_map_min());

    let mut ordered = Vec::with_capacity(12);
    for ver in candidate_versions {
        let storage_nodes = ver.nodes_of(position, ver.replication_factor);
        let preferred = rpc_helper.request_order(&layout, storage_nodes);
        for node in preferred {
            let already_listed = ordered.contains(&node);
            if !already_listed {
                ordered.push(node);
            }
        }
    }

    ordered
}
fn request_order(
&self,
layout: &LayoutHistory,
nodes: impl Iterator<Item = Uuid>,
) -> Vec<Uuid> {
// Retrieve some status variables that we will use to sort requests // Retrieve some status variables that we will use to sort requests
let peer_list = self.0.peering.get_peer_list(); let peer_list = self.0.peering.get_peer_list();
let ring: Arc<Ring> = self.0.ring.borrow().clone(); let our_zone = layout
let our_zone = match ring.layout.node_role(&self.0.our_node_id) { .current()
Some(pc) => &pc.zone, .get_node_zone(&self.0.our_node_id)
None => "", .unwrap_or("");
};
// Augment requests with some information used to sort them. // Augment requests with some information used to sort them.
// The tuples are as follows: // The tuples are as follows:
@ -405,22 +534,18 @@ impl RpcHelper {
// By sorting this vec, we priorize ourself, then nodes in the same zone, // By sorting this vec, we priorize ourself, then nodes in the same zone,
// and within a same zone we priorize nodes with the lowest latency. // and within a same zone we priorize nodes with the lowest latency.
let mut nodes = nodes let mut nodes = nodes
.iter()
.map(|to| { .map(|to| {
let peer_zone = match ring.layout.node_role(to) { let peer_zone = layout.current().get_node_zone(&to).unwrap_or("");
Some(pc) => &pc.zone,
None => "",
};
let peer_avg_ping = peer_list let peer_avg_ping = peer_list
.iter() .iter()
.find(|x| x.id.as_ref() == to.as_slice()) .find(|x| x.id.as_ref() == to.as_slice())
.and_then(|pi| pi.avg_ping) .and_then(|pi| pi.avg_ping)
.unwrap_or_else(|| Duration::from_secs(10)); .unwrap_or_else(|| Duration::from_secs(10));
( (
*to != self.0.our_node_id, to != self.0.our_node_id,
peer_zone != our_zone, peer_zone != our_zone,
peer_avg_ping, peer_avg_ping,
*to, to,
) )
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@ -434,3 +559,108 @@ impl RpcHelper {
.collect::<Vec<_>>() .collect::<Vec<_>>()
} }
} }
// ------- utility for tracking successes/errors among write sets --------
/// Bookkeeping for a request sent to several "quorum sets" of nodes.
///
/// A node may belong to more than one set. Each result registered for a node
/// is stored once, and counted once for every set that node belongs to.
/// `S` is the type of a success value, `E` the type of an error value.
pub struct QuorumSetResultTracker<S, E> {
/// The set of nodes and the index of the quorum sets they belong to
/// (indices refer to positions in the `sets` slice given to `new`)
pub nodes: HashMap<Uuid, Vec<usize>>,
/// The quorum value, i.e. number of success responses to await in each set
pub quorum: usize,
/// The success responses received, paired with the node that sent them
pub successes: Vec<(Uuid, S)>,
/// The error responses received, paired with the node that sent them
pub failures: Vec<(Uuid, E)>,
/// The counters for successes in each set (one slot per set)
pub success_counters: Box<[usize]>,
/// The counters for failures in each set (one slot per set)
pub failure_counters: Box<[usize]>,
/// The total number of nodes in each set
pub set_lens: Box<[usize]>,
}
impl<S, E> QuorumSetResultTracker<S, E>
where
    E: std::fmt::Display,
{
    /// Creates a tracker for the given quorum sets, with all counters at
    /// zero. `quorum` is the number of successes required in each set.
    pub fn new<A>(sets: &[A], quorum: usize) -> Self
    where
        A: AsRef<[Uuid]>,
    {
        let set_count = sets.len();

        // Single pass over the sets: record each set's size and, for every
        // node, the indices of the sets it appears in.
        let mut nodes: HashMap<Uuid, Vec<usize>> = HashMap::new();
        let mut set_lens = Vec::with_capacity(set_count);
        for (set_idx, set) in sets.iter().enumerate() {
            let members = set.as_ref();
            set_lens.push(members.len());
            for node in members {
                nodes.entry(*node).or_default().push(set_idx);
            }
        }

        let num_nodes = nodes.len();
        Self {
            nodes,
            quorum,
            successes: Vec::with_capacity(num_nodes),
            failures: Vec::new(),
            success_counters: vec![0; set_count].into_boxed_slice(),
            failure_counters: vec![0; set_count].into_boxed_slice(),
            set_lens: set_lens.into_boxed_slice(),
        }
    }

    /// Records the result received from `node` and increments the matching
    /// counter of every set the node belongs to.
    ///
    /// Panics if `node` was not part of the sets given to `new`.
    pub fn register_result(&mut self, node: Uuid, result: Result<S, E>) {
        // Store the value first, then pick which family of counters to bump.
        let counters: &mut [usize] = match result {
            Ok(value) => {
                self.successes.push((node, value));
                &mut *self.success_counters
            }
            Err(err) => {
                self.failures.push((node, err));
                &mut *self.failure_counters
            }
        };
        for set_idx in self.nodes.get(&node).unwrap() {
            counters[*set_idx] += 1;
        }
    }

    /// Returns true when every set has received at least `quorum` successes.
    pub fn all_quorums_ok(&self) -> bool {
        !self
            .success_counters
            .iter()
            .any(|ok_cnt| *ok_cnt < self.quorum)
    }

    /// Returns true when, in at least one set, the failures received so far
    /// leave fewer than `quorum` nodes that could still succeed.
    pub fn too_many_failures(&self) -> bool {
        for (err_cnt, set_len) in self.failure_counters.iter().zip(self.set_lens.iter()) {
            if *err_cnt + self.quorum > *set_len {
                return true;
            }
        }
        false
    }

    /// Consumes the tracker and returns the success values, in the order
    /// in which they were registered.
    pub fn success_values(self) -> Vec<S> {
        let mut values = Vec::with_capacity(self.successes.len());
        for (_, value) in self.successes {
            values.push(value);
        }
        values
    }

    /// Consumes the tracker and builds the quorum error describing the
    /// current state, with one formatted message per registered failure.
    pub fn quorum_error(self) -> Error {
        let mut errors = Vec::with_capacity(self.failures.len());
        for (node, err) in &self.failures {
            errors.push(format!("{:?}: {}", node, err));
        }
        Error::Quorum(
            self.quorum,
            Some(self.set_lens.len()),
            self.successes.len(),
            self.nodes.len(),
            errors,
        )
    }
}

View file

@ -1,9 +1,9 @@
//! Module containing structs related to membership management //! Module containing structs related to membership management
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::net::{IpAddr, SocketAddr}; use std::net::{IpAddr, SocketAddr};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock, RwLockReadGuard};
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use arc_swap::ArcSwapOption; use arc_swap::ArcSwapOption;
@ -12,8 +12,7 @@ use futures::join;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use sodiumoxide::crypto::sign::ed25519; use sodiumoxide::crypto::sign::ed25519;
use tokio::select; use tokio::select;
use tokio::sync::watch; use tokio::sync::{watch, Notify};
use tokio::sync::Mutex;
use garage_net::endpoint::{Endpoint, EndpointHandler}; use garage_net::endpoint::{Endpoint, EndpointHandler};
use garage_net::message::*; use garage_net::message::*;
@ -33,9 +32,10 @@ use garage_util::time::*;
use crate::consul::ConsulDiscovery; use crate::consul::ConsulDiscovery;
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
use crate::kubernetes::*; use crate::kubernetes::*;
use crate::layout::*; use crate::layout::{
self, manager::LayoutManager, LayoutHelper, LayoutHistory, NodeRoleV, RpcLayoutDigest,
};
use crate::replication_mode::*; use crate::replication_mode::*;
use crate::ring::*;
use crate::rpc_helper::*; use crate::rpc_helper::*;
use crate::system_metrics::*; use crate::system_metrics::*;
@ -46,10 +46,10 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
/// Version tag used for version check upon Netapp connection. /// Version tag used for version check upon Netapp connection.
/// Cluster nodes with different version tags are deemed /// Cluster nodes with different version tags are deemed
/// incompatible and will refuse to connect. /// incompatible and will refuse to connect.
pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008 pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A
/// RPC endpoint used for calls related to membership /// RPC endpoint used for calls related to membership
pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc";
/// RPC messages related to membership /// RPC messages related to membership
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
@ -58,17 +58,22 @@ pub enum SystemRpc {
Ok, Ok,
/// Request to connect to a specific node (in <pubkey>@<host>:<port> format, pubkey = full-length node ID) /// Request to connect to a specific node (in <pubkey>@<host>:<port> format, pubkey = full-length node ID)
Connect(String), Connect(String),
/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
PullClusterLayout,
/// Advertise Garage status. Answered with another AdvertiseStatus. /// Advertise Garage status. Answered with another AdvertiseStatus.
/// Exchanged with every node on a regular basis. /// Exchanged with every node on a regular basis.
AdvertiseStatus(NodeStatus), AdvertiseStatus(NodeStatus),
/// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout
AdvertiseClusterLayout(ClusterLayout),
/// Get known nodes states /// Get known nodes states
GetKnownNodes, GetKnownNodes,
/// Return known nodes /// Return known nodes
ReturnKnownNodes(Vec<KnownNodeInfo>), ReturnKnownNodes(Vec<KnownNodeInfo>),
/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
PullClusterLayout,
/// Advertisement of cluster layout. Sent spontaneously or in response to PullClusterLayout
AdvertiseClusterLayout(LayoutHistory),
/// Ask other node its cluster layout update trackers.
PullClusterLayoutTrackers,
/// Advertisement of cluster layout update trackers.
AdvertiseClusterLayoutTrackers(layout::UpdateTrackers),
} }
impl Rpc for SystemRpc { impl Rpc for SystemRpc {
@ -84,7 +89,6 @@ pub struct System {
/// The id of this node /// The id of this node
pub id: Uuid, pub id: Uuid,
persist_cluster_layout: Persister<ClusterLayout>,
persist_peer_list: Persister<PeerList>, persist_peer_list: Persister<PeerList>,
pub(crate) local_status: RwLock<NodeStatus>, pub(crate) local_status: RwLock<NodeStatus>,
@ -92,9 +96,8 @@ pub struct System {
pub netapp: Arc<NetApp>, pub netapp: Arc<NetApp>,
peering: Arc<PeeringManager>, peering: Arc<PeeringManager>,
pub rpc: RpcHelper,
system_endpoint: Arc<Endpoint<SystemRpc, System>>, pub(crate) system_endpoint: Arc<Endpoint<SystemRpc, System>>,
rpc_listen_addr: SocketAddr, rpc_listen_addr: SocketAddr,
rpc_public_addr: Option<SocketAddr>, rpc_public_addr: Option<SocketAddr>,
@ -105,14 +108,11 @@ pub struct System {
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
kubernetes_discovery: Option<KubernetesDiscoveryConfig>, kubernetes_discovery: Option<KubernetesDiscoveryConfig>,
pub layout_manager: Arc<LayoutManager>,
metrics: ArcSwapOption<SystemMetrics>, metrics: ArcSwapOption<SystemMetrics>,
replication_mode: ReplicationMode, pub(crate) replication_factor: ReplicationFactor,
pub(crate) replication_factor: usize,
/// The ring
pub ring: watch::Receiver<Arc<Ring>>,
update_ring: Mutex<watch::Sender<Arc<Ring>>>,
/// Path to metadata directory /// Path to metadata directory
pub metadata_dir: PathBuf, pub metadata_dir: PathBuf,
@ -123,14 +123,13 @@ pub struct System {
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeStatus { pub struct NodeStatus {
/// Hostname of the node /// Hostname of the node
pub hostname: String, pub hostname: Option<String>,
/// Replication factor configured on the node /// Replication factor configured on the node
pub replication_factor: usize, pub replication_factor: usize,
/// Cluster layout version
pub cluster_layout_version: u64, /// Cluster layout digest
/// Hash of cluster layout staging data pub layout_digest: RpcLayoutDigest,
pub cluster_layout_staging_hash: Hash,
/// Disk usage on partition containing metadata directory (tuple: `(avail, total)`) /// Disk usage on partition containing metadata directory (tuple: `(avail, total)`)
#[serde(default)] #[serde(default)]
@ -243,11 +242,11 @@ impl System {
/// Create this node's membership manager /// Create this node's membership manager
pub fn new( pub fn new(
network_key: NetworkKey, network_key: NetworkKey,
replication_mode: ReplicationMode, replication_factor: ReplicationFactor,
consistency_mode: ConsistencyMode,
config: &Config, config: &Config,
) -> Result<Arc<Self>, Error> { ) -> Result<Arc<Self>, Error> {
let replication_factor = replication_mode.replication_factor(); // ---- setup netapp RPC protocol ----
let node_key = let node_key =
gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID");
info!( info!(
@ -255,83 +254,39 @@ impl System {
hex::encode(&node_key.public_key()[..8]) hex::encode(&node_key.public_key()[..8])
); );
let persist_cluster_layout: Persister<ClusterLayout> =
Persister::new(&config.metadata_dir, "cluster_layout");
let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");
let cluster_layout = match persist_cluster_layout.load() {
Ok(x) => {
if x.replication_factor != replication_factor {
return Err(Error::Message(format!(
"Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
x.replication_factor,
replication_factor
)));
}
x
}
Err(e) => {
info!(
"No valid previous cluster layout stored ({}), starting fresh.",
e
);
ClusterLayout::new(replication_factor)
}
};
let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout);
local_status.update_disk_usage(&config.metadata_dir, &config.data_dir);
let ring = Ring::new(cluster_layout, replication_factor);
let (update_ring, ring) = watch::channel(Arc::new(ring));
let rpc_public_addr = match &config.rpc_public_addr {
Some(a_str) => {
use std::net::ToSocketAddrs;
match a_str.to_socket_addrs() {
Err(e) => {
error!(
"Cannot resolve rpc_public_addr {} from config file: {}.",
a_str, e
);
None
}
Ok(a) => {
let a = a.collect::<Vec<_>>();
if a.is_empty() {
error!("rpc_public_addr {} resolve to no known IP address", a_str);
}
if a.len() > 1 {
warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a);
}
a.into_iter().next()
}
}
}
None => {
let addr =
get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port()));
if let Some(a) = addr {
warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a);
}
addr
}
};
if rpc_public_addr.is_none() {
warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication.");
}
let bind_outgoing_to = Some(config) let bind_outgoing_to = Some(config)
.filter(|x| x.rpc_bind_outgoing) .filter(|x| x.rpc_bind_outgoing)
.map(|x| x.rpc_bind_addr.ip()); .map(|x| x.rpc_bind_addr.ip());
let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key, bind_outgoing_to); let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key, bind_outgoing_to);
let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into());
// ---- setup netapp public listener and full mesh peering strategy ----
let rpc_public_addr = get_rpc_public_addr(config);
if rpc_public_addr.is_none() {
warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication.");
}
let peering = PeeringManager::new(netapp.clone(), vec![], rpc_public_addr); let peering = PeeringManager::new(netapp.clone(), vec![], rpc_public_addr);
if let Some(ping_timeout) = config.rpc_ping_timeout_msec { if let Some(ping_timeout) = config.rpc_ping_timeout_msec {
peering.set_ping_timeout_millis(ping_timeout); peering.set_ping_timeout_millis(ping_timeout);
} }
let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");
// ---- setup cluster layout and layout manager ----
let layout_manager = LayoutManager::new(
config,
netapp.id,
system_endpoint.clone(),
peering.clone(),
replication_factor,
consistency_mode,
)?;
let mut local_status = NodeStatus::initial(replication_factor, &layout_manager);
local_status.update_disk_usage(&config.metadata_dir, &config.data_dir);
// ---- if enabled, set up additional peer discovery methods ----
#[cfg(feature = "consul-discovery")] #[cfg(feature = "consul-discovery")]
let consul_discovery = match &config.consul_discovery { let consul_discovery = match &config.consul_discovery {
Some(cfg) => Some( Some(cfg) => Some(
@ -350,22 +305,15 @@ impl System {
warn!("Kubernetes discovery is not enabled in this build."); warn!("Kubernetes discovery is not enabled in this build.");
} }
// ---- almost done ----
let sys = Arc::new(System { let sys = Arc::new(System {
id: netapp.id.into(), id: netapp.id.into(),
persist_cluster_layout,
persist_peer_list, persist_peer_list,
local_status: RwLock::new(local_status), local_status: RwLock::new(local_status),
node_status: RwLock::new(HashMap::new()), node_status: RwLock::new(HashMap::new()),
netapp: netapp.clone(), netapp: netapp.clone(),
peering: peering.clone(), peering: peering.clone(),
rpc: RpcHelper::new(
netapp.id.into(),
peering,
ring.clone(),
config.rpc_timeout_msec.map(Duration::from_millis),
),
system_endpoint, system_endpoint,
replication_mode,
replication_factor, replication_factor,
rpc_listen_addr: config.rpc_bind_addr, rpc_listen_addr: config.rpc_bind_addr,
rpc_public_addr, rpc_public_addr,
@ -374,10 +322,9 @@ impl System {
consul_discovery, consul_discovery,
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
kubernetes_discovery: config.kubernetes_discovery.clone(), kubernetes_discovery: config.kubernetes_discovery.clone(),
layout_manager,
metrics: ArcSwapOption::new(None), metrics: ArcSwapOption::new(None),
ring,
update_ring: Mutex::new(update_ring),
metadata_dir: config.metadata_dir.clone(), metadata_dir: config.metadata_dir.clone(),
data_dir: config.data_dir.clone(), data_dir: config.data_dir.clone(),
}); });
@ -409,6 +356,20 @@ impl System {
self.metrics.store(None); self.metrics.store(None);
} }
// ---- Public utilities / accessors ----
/// Returns a read guard over the `LayoutHelper` held by the layout manager.
///
/// This simply forwards to `LayoutManager::layout()`; the returned value is a
/// `RwLockReadGuard`, so the layout stays read-locked for as long as it is held.
pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHelper> {
self.layout_manager.layout()
}
/// Returns a new handle (cloned `Arc`) to the layout manager's `change_notify`.
///
/// NOTE(review): presumably this `Notify` is signalled whenever the cluster
/// layout changes -- confirm against `LayoutManager`.
pub fn layout_notify(&self) -> Arc<Notify> {
self.layout_manager.change_notify.clone()
}
/// Borrows the `RpcHelper` owned by the layout manager.
///
/// The helper is stored as the `rpc_helper` field of `layout_manager`, not on
/// `System` itself; this accessor hides that indirection from callers.
pub fn rpc_helper(&self) -> &RpcHelper {
&self.layout_manager.rpc_helper
}
// ---- Administrative operations (directly available and // ---- Administrative operations (directly available and
// also available through RPC) ---- // also available through RPC) ----
@ -435,18 +396,6 @@ impl System {
known_nodes known_nodes
} }
/// Returns an owned copy of the cluster layout stored in the current ring.
///
/// The ring is read through the `ring` watch channel; the layout is cloned so
/// the caller does not keep the channel borrowed.
pub fn get_cluster_layout(&self) -> ClusterLayout {
self.ring.borrow().layout.clone()
}
/// Submit a cluster layout to this node as if it had been advertised by a peer.
///
/// Delegates to `handle_advertise_cluster_layout` and discards its RPC reply:
/// callers only learn whether the operation succeeded.
///
/// # Errors
///
/// Returns whatever error `handle_advertise_cluster_layout` returns.
pub async fn update_cluster_layout(
    self: &Arc<Self>,
    layout: &ClusterLayout,
) -> Result<(), Error> {
    // The reply value is only meaningful to remote callers; drop it here.
    self.handle_advertise_cluster_layout(layout)
        .await
        .map(|_reply| ())
}
pub async fn connect(&self, node: &str) -> Result<(), Error> { pub async fn connect(&self, node: &str) -> Result<(), Error> {
let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node) let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node)
.await .await
@ -476,47 +425,65 @@ impl System {
} }
pub fn health(&self) -> ClusterHealth { pub fn health(&self) -> ClusterHealth {
let ring: Arc<_> = self.ring.borrow().clone(); let quorum = self
let quorum = self.replication_mode.write_quorum(); .replication_factor
let replication_factor = self.replication_factor; .write_quorum(ConsistencyMode::Consistent);
// Gather information about running nodes.
// Technically, `nodes` contains currently running nodes, as well
// as nodes that this Garage process has been connected to at least
// once since it started.
let nodes = self let nodes = self
.get_known_nodes() .get_known_nodes()
.into_iter() .into_iter()
.map(|n| (n.id, n)) .map(|n| (n.id, n))
.collect::<HashMap<Uuid, _>>(); .collect::<HashMap<Uuid, _>>();
let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count();
let node_up = |x: &Uuid| nodes.get(x).map(|n| n.is_up).unwrap_or(false);
let storage_nodes = ring // Acquire a rwlock read-lock to the current cluster layout
.layout let layout = self.cluster_layout();
.roles
.items()
.iter()
.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some()))
.collect::<Vec<_>>();
let storage_nodes_ok = storage_nodes
.iter()
.filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false))
.count();
let partitions = ring.partitions(); // Obtain information about nodes that have a role as storage nodes
let partitions_n_up = partitions // in one of the active layout versions
.iter() let mut storage_nodes = HashSet::<Uuid>::with_capacity(16);
.map(|(_, h)| { for ver in layout.versions.iter() {
let pn = ring.get_nodes(h, ring.replication_factor); storage_nodes.extend(
pn.iter() ver.roles
.filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .items()
.count() .iter()
}) .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some()))
.collect::<Vec<usize>>(); .map(|(n, _, _)| *n),
let partitions_all_ok = partitions_n_up )
.iter() }
.filter(|c| **c == replication_factor) let storage_nodes_ok = storage_nodes.iter().filter(|x| node_up(x)).count();
.count();
let partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count();
// Determine the number of partitions that have:
// - a quorum of up nodes for all write sets (i.e. are available)
// - for which all nodes in all write sets are up (i.e. are fully healthy)
let partitions = layout.current().partitions().collect::<Vec<_>>();
let mut partitions_quorum = 0;
let mut partitions_all_ok = 0;
for (_, hash) in partitions.iter() {
let mut write_sets = layout
.versions
.iter()
.map(|x| x.nodes_of(hash, x.replication_factor));
let has_quorum = write_sets
.clone()
.all(|set| set.filter(|x| node_up(x)).count() >= quorum);
let all_ok = write_sets.all(|mut set| set.all(|x| node_up(&x)));
if has_quorum {
partitions_quorum += 1;
}
if all_ok {
partitions_all_ok += 1;
}
}
// Determine overall cluster status
let status = let status =
if partitions_quorum == partitions.len() && storage_nodes_ok == storage_nodes.len() { if partitions_all_ok == partitions.len() && storage_nodes_ok == storage_nodes.len() {
ClusterHealthStatus::Healthy ClusterHealthStatus::Healthy
} else if partitions_quorum == partitions.len() { } else if partitions_quorum == partitions.len() {
ClusterHealthStatus::Degraded ClusterHealthStatus::Degraded
@ -553,7 +520,7 @@ impl System {
} }
}; };
let hostname = self.local_status.read().unwrap().hostname.clone(); let hostname = self.local_status.read().unwrap().hostname.clone().unwrap();
if let Err(e) = c if let Err(e) = c
.publish_consul_service(self.netapp.id, &hostname, rpc_public_addr) .publish_consul_service(self.netapp.id, &hostname, rpc_public_addr)
.await .await
@ -577,30 +544,16 @@ impl System {
} }
}; };
let hostname = self.local_status.read().unwrap().hostname.clone(); let hostname = self.local_status.read().unwrap().hostname.clone().unwrap();
if let Err(e) = publish_kubernetes_node(k, self.netapp.id, &hostname, rpc_public_addr).await if let Err(e) = publish_kubernetes_node(k, self.netapp.id, &hostname, rpc_public_addr).await
{ {
error!("Error while publishing node to Kubernetes: {}", e); error!("Error while publishing node to Kubernetes: {}", e);
} }
} }
/// Persist the layout of the current ring through `persist_cluster_layout`.
///
/// # Panics
///
/// Panics with "Cannot save current cluster layout" if the persister fails;
/// as written, this function never actually returns `Err`.
async fn save_cluster_layout(&self) -> Result<(), Error> {
    // Take our own handle on the ring so the watch borrow is released
    // before awaiting on the persister.
    let current: Arc<Ring> = Arc::clone(&self.ring.borrow());
    let outcome = self
        .persist_cluster_layout
        .save_async(&current.layout)
        .await;
    outcome.expect("Cannot save current cluster layout");
    Ok(())
}
fn update_local_status(&self) { fn update_local_status(&self) {
let mut local_status = self.local_status.write().unwrap(); let mut local_status = self.local_status.write().unwrap();
local_status.layout_digest = self.layout_manager.layout().digest();
let ring = self.ring.borrow();
local_status.cluster_layout_version = ring.layout.version;
local_status.cluster_layout_staging_hash = ring.layout.staging_hash;
local_status.update_disk_usage(&self.metadata_dir, &self.data_dir); local_status.update_disk_usage(&self.metadata_dir, &self.data_dir);
} }
@ -611,11 +564,6 @@ impl System {
Ok(SystemRpc::Ok) Ok(SystemRpc::Ok)
} }
/// Build the reply to a `PullClusterLayout` request: an
/// `AdvertiseClusterLayout` message carrying a copy of the layout of the
/// current ring.
fn handle_pull_cluster_layout(&self) -> SystemRpc {
    let layout = self.ring.borrow().layout.clone();
    SystemRpc::AdvertiseClusterLayout(layout)
}
fn handle_get_known_nodes(&self) -> SystemRpc { fn handle_get_known_nodes(&self) -> SystemRpc {
let known_nodes = self.get_known_nodes(); let known_nodes = self.get_known_nodes();
SystemRpc::ReturnKnownNodes(known_nodes) SystemRpc::ReturnKnownNodes(known_nodes)
@ -635,11 +583,8 @@ impl System {
std::process::exit(1); std::process::exit(1);
} }
if info.cluster_layout_version > local_info.cluster_layout_version self.layout_manager
|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash .handle_advertise_status(from, &info.layout_digest);
{
tokio::spawn(self.clone().pull_cluster_layout(from));
}
drop(local_info); drop(local_info);
@ -651,57 +596,6 @@ impl System {
Ok(SystemRpc::Ok) Ok(SystemRpc::Ok)
} }
/// Handle a cluster layout advertised to this node.
///
/// The advertised layout is rejected with an error if its replication factor
/// differs from this node's. Otherwise it is merged into a copy of the layout
/// of the current ring. When `merge` returns `true` (presumably meaning the
/// merge changed something -- confirm against `ClusterLayout::merge`), a new
/// ring is published, the merged layout is broadcast to peers from a spawned
/// task, and the layout is saved through `save_cluster_layout`.
///
/// Returns `SystemRpc::Ok` when no error occurred, whether or not the merge
/// led to an update.
///
/// # Errors
///
/// - the advertised replication factor does not match ours;
/// - our previous layout passed `check()` but the merged one does not;
/// - publishing the new ring or saving the layout fails.
async fn handle_advertise_cluster_layout(
self: &Arc<Self>,
adv: &ClusterLayout,
) -> Result<SystemRpc, Error> {
// Refuse layouts built for a different replication factor than ours.
if adv.replication_factor != self.replication_factor {
let msg = format!(
"Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
adv.replication_factor,
self.replication_factor
);
error!("{}", msg);
return Err(Error::Message(msg));
}
// Take the `update_ring` lock before reading the current layout; it is held
// until the new ring has been sent (or until return if nothing is sent).
let update_ring = self.update_ring.lock().await;
// Work on a private copy of the current layout.
let mut layout: ClusterLayout = self.ring.borrow().layout.clone();
// Remember whether the layout was valid before merging: an invalid merge
// result is only rejected if we would be going from valid to invalid.
let prev_layout_check = layout.check().is_ok();
if layout.merge(adv) {
if prev_layout_check && layout.check().is_err() {
error!("New cluster layout is invalid, discarding.");
return Err(Error::Message(
"New cluster layout is invalid, discarding.".into(),
));
}
// Publish a ring built from the merged layout to all watchers.
let ring = Ring::new(layout.clone(), self.replication_factor);
update_ring.send(Arc::new(ring))?;
// Release the lock before broadcasting and saving.
drop(update_ring);
// Broadcast the merged layout from a spawned task so this handler does not
// wait for it; a broadcast failure is only logged as a warning.
let self2 = self.clone();
tokio::spawn(async move {
if let Err(e) = self2
.rpc
.broadcast(
&self2.system_endpoint,
SystemRpc::AdvertiseClusterLayout(layout),
RequestStrategy::with_priority(PRIO_HIGH),
)
.await
{
warn!("Error while broadcasting new cluster layout: {}", e);
}
});
// Persist the layout of the ring as it stands now.
self.save_cluster_layout().await?;
}
Ok(SystemRpc::Ok)
}
async fn status_exchange_loop(&self, mut stop_signal: watch::Receiver<bool>) { async fn status_exchange_loop(&self, mut stop_signal: watch::Receiver<bool>) {
while !*stop_signal.borrow() { while !*stop_signal.borrow() {
let restart_at = Instant::now() + STATUS_EXCHANGE_INTERVAL; let restart_at = Instant::now() + STATUS_EXCHANGE_INTERVAL;
@ -711,7 +605,7 @@ impl System {
let local_status: NodeStatus = self.local_status.read().unwrap().clone(); let local_status: NodeStatus = self.local_status.read().unwrap().clone();
let _ = self let _ = self
.rpc .rpc_helper()
.broadcast( .broadcast(
&self.system_endpoint, &self.system_endpoint,
SystemRpc::AdvertiseStatus(local_status), SystemRpc::AdvertiseStatus(local_status),
@ -736,10 +630,9 @@ impl System {
.filter(|p| p.is_up()) .filter(|p| p.is_up())
.count(); .count();
let not_configured = self.ring.borrow().layout.check().is_err(); let not_configured = self.cluster_layout().check().is_err();
let no_peers = n_connected < self.replication_factor; let no_peers = n_connected < self.replication_factor.into();
let expected_n_nodes = self.cluster_layout().all_nodes().len();
let expected_n_nodes = self.ring.borrow().layout.num_nodes();
let bad_peers = n_connected != expected_n_nodes; let bad_peers = n_connected != expected_n_nodes;
if not_configured || no_peers || bad_peers { if not_configured || no_peers || bad_peers {
@ -791,8 +684,8 @@ impl System {
// If the layout is configured, and we already have some connections // If the layout is configured, and we already have some connections
// to other nodes in the cluster, we can skip trying to connect to // to other nodes in the cluster, we can skip trying to connect to
// nodes that are not in the cluster layout. // nodes that are not in the cluster layout.
let ring = self.ring.borrow(); let layout = self.cluster_layout();
ping_list.retain(|(id, _)| ring.layout.node_ids().contains(&(*id).into())); ping_list.retain(|(id, _)| layout.all_nodes().contains(&(*id).into()));
} }
for (node_id, node_addr) in ping_list { for (node_id, node_addr) in ping_list {
@ -847,48 +740,49 @@ impl System {
.save_async(&PeerList(peer_list)) .save_async(&PeerList(peer_list))
.await .await
} }
/// Ask `peer` for its cluster layout and, if it answers with one, process it
/// through `handle_advertise_cluster_layout`.
///
/// This is best-effort: a failed call, an unexpected reply, or an error while
/// handling the received layout are all silently ignored.
async fn pull_cluster_layout(self: Arc<Self>, peer: Uuid) {
    let strategy = RequestStrategy::with_priority(PRIO_HIGH);
    let reply = self
        .rpc
        .call(
            &self.system_endpoint,
            peer,
            SystemRpc::PullClusterLayout,
            strategy,
        )
        .await;

    if let Ok(SystemRpc::AdvertiseClusterLayout(received)) = reply {
        // The outcome is deliberately discarded.
        let _: Result<_, _> = self.handle_advertise_cluster_layout(&received).await;
    }
}
} }
#[async_trait] #[async_trait]
impl EndpointHandler<SystemRpc> for System { impl EndpointHandler<SystemRpc> for System {
async fn handle(self: &Arc<Self>, msg: &SystemRpc, from: NodeID) -> Result<SystemRpc, Error> { async fn handle(self: &Arc<Self>, msg: &SystemRpc, from: NodeID) -> Result<SystemRpc, Error> {
match msg { match msg {
// ---- system functions -> System ----
SystemRpc::Connect(node) => self.handle_connect(node).await, SystemRpc::Connect(node) => self.handle_connect(node).await,
SystemRpc::PullClusterLayout => Ok(self.handle_pull_cluster_layout()),
SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await, SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await,
SystemRpc::AdvertiseClusterLayout(adv) => {
self.clone().handle_advertise_cluster_layout(adv).await
}
SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()),
// ---- layout functions -> LayoutManager ----
SystemRpc::PullClusterLayout => Ok(self.layout_manager.handle_pull_cluster_layout()),
SystemRpc::AdvertiseClusterLayout(adv) => {
self.layout_manager
.handle_advertise_cluster_layout(adv)
.await
}
SystemRpc::PullClusterLayoutTrackers => {
Ok(self.layout_manager.handle_pull_cluster_layout_trackers())
}
SystemRpc::AdvertiseClusterLayoutTrackers(adv) => {
self.layout_manager
.handle_advertise_cluster_layout_trackers(adv)
.await
}
// ---- other -> Error ----
m => Err(Error::unexpected_rpc_message(m)), m => Err(Error::unexpected_rpc_message(m)),
} }
} }
} }
impl NodeStatus { impl NodeStatus {
fn initial(replication_factor: usize, layout: &ClusterLayout) -> Self { fn initial(replication_factor: ReplicationFactor, layout_manager: &LayoutManager) -> Self {
NodeStatus { NodeStatus {
hostname: gethostname::gethostname() hostname: Some(
.into_string() gethostname::gethostname()
.unwrap_or_else(|_| "<invalid utf-8>".to_string()), .into_string()
replication_factor, .unwrap_or_else(|_| "<invalid utf-8>".to_string()),
cluster_layout_version: layout.version, ),
cluster_layout_staging_hash: layout.staging_hash, replication_factor: replication_factor.into(),
layout_digest: layout_manager.layout().digest(),
meta_disk_avail: None, meta_disk_avail: None,
data_disk_avail: None, data_disk_avail: None,
} }
@ -896,10 +790,9 @@ impl NodeStatus {
fn unknown() -> Self { fn unknown() -> Self {
NodeStatus { NodeStatus {
hostname: "?".to_string(), hostname: None,
replication_factor: 0, replication_factor: 0,
cluster_layout_version: 0, layout_digest: Default::default(),
cluster_layout_staging_hash: Hash::from([0u8; 32]),
meta_disk_avail: None, meta_disk_avail: None,
data_disk_avail: None, data_disk_avail: None,
} }
@ -952,6 +845,40 @@ fn get_default_ip() -> Option<IpAddr> {
.map(|a| a.ip()) .map(|a| a.ip())
} }
/// Determine the RPC address under which this node announces itself.
///
/// If `rpc_public_addr` is set in the configuration, it is resolved and the
/// first resulting socket address is returned. A resolution failure, or a
/// resolution yielding no address, is logged as an error and gives `None`.
///
/// If it is not set, the address is built from `get_default_ip()` and the port
/// of `rpc_bind_addr`; a warning is logged when such an address is found.
fn get_rpc_public_addr(config: &Config) -> Option<SocketAddr> {
    use std::net::ToSocketAddrs;

    if let Some(a_str) = &config.rpc_public_addr {
        let resolved = match a_str.to_socket_addrs() {
            Ok(addrs) => addrs.collect::<Vec<_>>(),
            Err(e) => {
                error!(
                    "Cannot resolve rpc_public_addr {} from config file: {}.",
                    a_str, e
                );
                return None;
            }
        };
        if resolved.is_empty() {
            error!("rpc_public_addr {} resolve to no known IP address", a_str);
        }
        if resolved.len() > 1 {
            warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", resolved);
        }
        // An empty resolution naturally yields `None` here.
        return resolved.into_iter().next();
    }

    // Nothing configured: fall back to the autodetected IP and the bind port.
    let detected = get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port()));
    if let Some(a) = detected {
        warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a);
    }
    detected
}
async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> { async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> {
let mut ret = vec![]; let mut ret = vec![];

View file

@ -3,7 +3,6 @@ use std::time::{Duration, Instant};
use opentelemetry::{global, metrics::*, KeyValue}; use opentelemetry::{global, metrics::*, KeyValue};
use crate::ring::Ring;
use crate::system::{ClusterHealthStatus, System}; use crate::system::{ClusterHealthStatus, System};
/// TableMetrics reference all counter used for metrics /// TableMetrics reference all counter used for metrics
@ -69,7 +68,7 @@ impl SystemMetrics {
let replication_factor = system.replication_factor; let replication_factor = system.replication_factor;
meter meter
.u64_value_observer("garage_replication_factor", move |observer| { .u64_value_observer("garage_replication_factor", move |observer| {
observer.observe(replication_factor as u64, &[]) observer.observe(replication_factor.replication_factor() as u64, &[])
}) })
.with_description("Garage replication factor setting") .with_description("Garage replication factor setting")
.init() .init()
@ -215,14 +214,14 @@ impl SystemMetrics {
let system = system.clone(); let system = system.clone();
meter meter
.u64_value_observer("cluster_layout_node_connected", move |observer| { .u64_value_observer("cluster_layout_node_connected", move |observer| {
let ring: Arc<Ring> = system.ring.borrow().clone(); let layout = system.cluster_layout();
let nodes = system.get_known_nodes(); let nodes = system.get_known_nodes();
for (id, _, config) in ring.layout.roles.items().iter() { for id in layout.all_nodes().iter() {
if let Some(role) = &config.0 { let mut kv = vec![KeyValue::new("id", format!("{:?}", id))];
let mut kv = vec![ if let Some(role) =
KeyValue::new("id", format!("{:?}", id)), layout.current().roles.get(id).and_then(|r| r.0.as_ref())
KeyValue::new("role_zone", role.zone.clone()), {
]; kv.push(KeyValue::new("role_zone", role.zone.clone()));
match role.capacity { match role.capacity {
Some(cap) => { Some(cap) => {
kv.push(KeyValue::new("role_capacity", cap as i64)); kv.push(KeyValue::new("role_capacity", cap as i64));
@ -232,24 +231,24 @@ impl SystemMetrics {
kv.push(KeyValue::new("role_gateway", 1)); kv.push(KeyValue::new("role_gateway", 1));
} }
} }
}
let value; let value;
if let Some(node) = nodes.iter().find(|n| n.id == *id) { if let Some(node) = nodes.iter().find(|n| n.id == *id) {
value = if node.is_up { 1 } else { 0 };
// TODO: if we add address and hostname, and those change, we // TODO: if we add address and hostname, and those change, we
// get duplicate metrics, due to bad otel aggregation :( // get duplicate metrics, due to bad otel aggregation :(
// Can probably be fixed when we upgrade opentelemetry // Can probably be fixed when we upgrade opentelemetry
// kv.push(KeyValue::new("address", node.addr.to_string())); // kv.push(KeyValue::new("address", node.addr.to_string()));
// kv.push(KeyValue::new( // kv.push(KeyValue::new(
// "hostname", // "hostname",
// node.status.hostname.clone(), // node.status.hostname.clone(),
// )); // ));
} else { value = if node.is_up { 1 } else { 0 };
value = 0; } else {
} value = 0;
observer.observe(value, &kv);
} }
observer.observe(value, &kv);
} }
}) })
.with_description("Connection status for nodes in the cluster layout") .with_description("Connection status for nodes in the cluster layout")
@ -259,14 +258,14 @@ impl SystemMetrics {
let system = system.clone(); let system = system.clone();
meter meter
.u64_value_observer("cluster_layout_node_disconnected_time", move |observer| { .u64_value_observer("cluster_layout_node_disconnected_time", move |observer| {
let ring: Arc<Ring> = system.ring.borrow().clone(); let layout = system.cluster_layout();
let nodes = system.get_known_nodes(); let nodes = system.get_known_nodes();
for (id, _, config) in ring.layout.roles.items().iter() { for id in layout.all_nodes().iter() {
if let Some(role) = &config.0 { let mut kv = vec![KeyValue::new("id", format!("{:?}", id))];
let mut kv = vec![ if let Some(role) =
KeyValue::new("id", format!("{:?}", id)), layout.current().roles.get(id).and_then(|r| r.0.as_ref())
KeyValue::new("role_zone", role.zone.clone()), {
]; kv.push(KeyValue::new("role_zone", role.zone.clone()));
match role.capacity { match role.capacity {
Some(cap) => { Some(cap) => {
kv.push(KeyValue::new("role_capacity", cap as i64)); kv.push(KeyValue::new("role_capacity", cap as i64));
@ -276,19 +275,19 @@ impl SystemMetrics {
kv.push(KeyValue::new("role_gateway", 1)); kv.push(KeyValue::new("role_gateway", 1));
} }
} }
}
if let Some(node) = nodes.iter().find(|n| n.id == *id) { if let Some(node) = nodes.iter().find(|n| n.id == *id) {
// TODO: see comment above // TODO: see comment above
// kv.push(KeyValue::new("address", node.addr.to_string())); // kv.push(KeyValue::new("address", node.addr.to_string()));
// kv.push(KeyValue::new( // kv.push(KeyValue::new(
// "hostname", // "hostname",
// node.status.hostname.clone(), // node.status.hostname.clone(),
// )); // ));
if node.is_up { if node.is_up {
observer.observe(0, &kv); observer.observe(0, &kv);
} else if let Some(secs) = node.last_seen_secs_ago { } else if let Some(secs) = node.last_seen_secs_ago {
observer.observe(secs, &kv); observer.observe(secs, &kv);
}
} }
} }
} }

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_table" name = "garage_table"
version = "0.9.3" version = "0.10.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"

View file

@ -6,7 +6,6 @@ use serde_bytes::ByteBuf;
use tokio::sync::Notify; use tokio::sync::Notify;
use garage_db as db; use garage_db as db;
use garage_db::counted_tree_hack::CountedTree;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
@ -36,7 +35,7 @@ pub struct TableData<F: TableSchema, R: TableReplication> {
pub(crate) insert_queue: db::Tree, pub(crate) insert_queue: db::Tree,
pub(crate) insert_queue_notify: Arc<Notify>, pub(crate) insert_queue_notify: Arc<Notify>,
pub(crate) gc_todo: CountedTree, pub(crate) gc_todo: db::Tree,
pub(crate) metrics: TableMetrics, pub(crate) metrics: TableMetrics,
} }
@ -61,7 +60,6 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
let gc_todo = db let gc_todo = db
.open_tree(format!("{}:gc_todo_v2", F::TABLE_NAME)) .open_tree(format!("{}:gc_todo_v2", F::TABLE_NAME))
.expect("Unable to open GC DB tree"); .expect("Unable to open GC DB tree");
let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2");
let metrics = TableMetrics::new( let metrics = TableMetrics::new(
F::TABLE_NAME, F::TABLE_NAME,
@ -254,7 +252,8 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
// of the GC algorithm, as in all cases GC is suspended if // of the GC algorithm, as in all cases GC is suspended if
// any node of the partition is unavailable. // any node of the partition is unavailable.
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap(); let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
let nodes = self.replication.write_nodes(&pk_hash); // TODO: this probably breaks when the layout changes
let nodes = self.replication.storage_nodes(&pk_hash);
if nodes.first() == Some(&self.system.id) { if nodes.first() == Some(&self.system.id) {
GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?; GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
} }
@ -369,6 +368,6 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
} }
pub fn gc_todo_len(&self) -> Result<usize, Error> { pub fn gc_todo_len(&self) -> Result<usize, Error> {
Ok(self.gc_todo.len()) Ok(self.gc_todo.len()?)
} }
} }

View file

@ -10,7 +10,7 @@ use serde_bytes::ByteBuf;
use futures::future::join_all; use futures::future::join_all;
use tokio::sync::watch; use tokio::sync::watch;
use garage_db::counted_tree_hack::CountedTree; use garage_db as db;
use garage_util::background::*; use garage_util::background::*;
use garage_util::data::*; use garage_util::data::*;
@ -152,7 +152,7 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
let mut partitions = HashMap::new(); let mut partitions = HashMap::new();
for entry in entries { for entry in entries {
let pkh = Hash::try_from(&entry.key[..32]).unwrap(); let pkh = Hash::try_from(&entry.key[..32]).unwrap();
let mut nodes = self.data.replication.write_nodes(&pkh); let mut nodes = self.data.replication.storage_nodes(&pkh);
nodes.retain(|x| *x != self.system.id); nodes.retain(|x| *x != self.system.id);
nodes.sort(); nodes.sort();
@ -227,10 +227,10 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
// GC'ing is not a critical function of the system, so it's not a big // GC'ing is not a critical function of the system, so it's not a big
// deal if we can't do it right now. // deal if we can't do it right now.
self.system self.system
.rpc .rpc_helper()
.try_call_many( .try_call_many(
&self.endpoint, &self.endpoint,
&nodes[..], &nodes,
GcRpc::Update(updates), GcRpc::Update(updates),
RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
) )
@ -248,10 +248,10 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
// it means that the garbage collection wasn't completed and has // it means that the garbage collection wasn't completed and has
// to be retried later. // to be retried later.
self.system self.system
.rpc .rpc_helper()
.try_call_many( .try_call_many(
&self.endpoint, &self.endpoint,
&nodes[..], &nodes,
GcRpc::DeleteIfEqualHash(deletes), GcRpc::DeleteIfEqualHash(deletes),
RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
) )
@ -334,9 +334,9 @@ impl<F: TableSchema, R: TableReplication> Worker for GcWorker<F, R> {
} }
} }
/// An entry stored in the gc_todo Sled tree associated with the table /// An entry stored in the gc_todo db tree associated with the table
/// Contains helper function for parsing, saving, and removing /// Contains helper function for parsing, saving, and removing
/// such entry in Sled /// such entry in the db
/// ///
/// Format of an entry: /// Format of an entry:
/// - key = 8 bytes: timestamp of tombstone /// - key = 8 bytes: timestamp of tombstone
@ -353,7 +353,7 @@ pub(crate) struct GcTodoEntry {
} }
impl GcTodoEntry { impl GcTodoEntry {
/// Creates a new GcTodoEntry (not saved in Sled) from its components: /// Creates a new GcTodoEntry (not saved in the db) from its components:
/// the key of an entry in the table, and the hash of the associated /// the key of an entry in the table, and the hash of the associated
/// serialized value /// serialized value
pub(crate) fn new(key: Vec<u8>, value_hash: Hash) -> Self { pub(crate) fn new(key: Vec<u8>, value_hash: Hash) -> Self {
@ -376,7 +376,7 @@ impl GcTodoEntry {
} }
/// Saves the GcTodoEntry in the gc_todo tree /// Saves the GcTodoEntry in the gc_todo tree
pub(crate) fn save(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { pub(crate) fn save(&self, gc_todo_tree: &db::Tree) -> Result<(), Error> {
gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?; gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?;
Ok(()) Ok(())
} }
@ -386,12 +386,14 @@ impl GcTodoEntry {
/// This is usefull to remove a todo entry only under the condition /// This is usefull to remove a todo entry only under the condition
/// that it has not changed since the time it was read, i.e. /// that it has not changed since the time it was read, i.e.
/// what we have to do is still the same /// what we have to do is still the same
pub(crate) fn remove_if_equal(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { pub(crate) fn remove_if_equal(&self, gc_todo_tree: &db::Tree) -> Result<(), Error> {
gc_todo_tree.compare_and_swap::<_, _, &[u8]>( gc_todo_tree.db().transaction(|txn| {
&self.todo_table_key(), let key = self.todo_table_key();
Some(self.value_hash), if txn.get(gc_todo_tree, &key)?.as_deref() == Some(self.value_hash.as_slice()) {
None, txn.remove(gc_todo_tree, &key)?;
)?; }
Ok(())
})?;
Ok(()) Ok(())
} }

View file

@ -13,7 +13,7 @@ use garage_util::data::*;
use garage_util::encode::{nonversioned_decode, nonversioned_encode}; use garage_util::encode::{nonversioned_decode, nonversioned_encode};
use garage_util::error::Error; use garage_util::error::Error;
use garage_rpc::ring::*; use garage_rpc::layout::*;
use crate::data::*; use crate::data::*;
use crate::replication::*; use crate::replication::*;
@ -31,14 +31,14 @@ pub struct MerkleUpdater<F: TableSchema, R: TableReplication> {
// - value = the hash of the full serialized item, if present, // - value = the hash of the full serialized item, if present,
// or an empty vec if item is absent (deleted) // or an empty vec if item is absent (deleted)
// Fields in data: // Fields in data:
// pub(crate) merkle_todo: sled::Tree, // pub(crate) merkle_todo: db::Tree,
// pub(crate) merkle_todo_notify: Notify, // pub(crate) merkle_todo_notify: Notify,
// Content of the merkle tree: items where // Content of the merkle tree: items where
// - key = .bytes() for MerkleNodeKey // - key = .bytes() for MerkleNodeKey
// - value = serialization of a MerkleNode, assumed to be MerkleNode::empty if not found // - value = serialization of a MerkleNode, assumed to be MerkleNode::empty if not found
// Field in data: // Field in data:
// pub(crate) merkle_tree: sled::Tree, // pub(crate) merkle_tree: db::Tree,
empty_node_hash: Hash, empty_node_hash: Hash,
} }
@ -291,10 +291,6 @@ impl<F: TableSchema, R: TableReplication> MerkleUpdater<F, R> {
Ok(self.data.merkle_tree.len()?) Ok(self.data.merkle_tree.len()?)
} }
pub fn merkle_tree_fast_len(&self) -> Result<Option<usize>, Error> {
Ok(self.data.merkle_tree.fast_len()?)
}
pub fn todo_len(&self) -> Result<usize, Error> { pub fn todo_len(&self) -> Result<usize, Error> {
Ok(self.data.merkle_todo.len()?) Ok(self.data.merkle_todo.len()?)
} }

View file

@ -1,7 +1,6 @@
use opentelemetry::{global, metrics::*, KeyValue}; use opentelemetry::{global, metrics::*, KeyValue};
use garage_db as db; use garage_db as db;
use garage_db::counted_tree_hack::CountedTree;
/// TableMetrics reference all counter used for metrics /// TableMetrics reference all counter used for metrics
pub struct TableMetrics { pub struct TableMetrics {
@ -27,7 +26,7 @@ impl TableMetrics {
store: db::Tree, store: db::Tree,
merkle_tree: db::Tree, merkle_tree: db::Tree,
merkle_todo: db::Tree, merkle_todo: db::Tree,
gc_todo: CountedTree, gc_todo: db::Tree,
) -> Self { ) -> Self {
let meter = global::meter(table_name); let meter = global::meter(table_name);
TableMetrics { TableMetrics {
@ -35,9 +34,9 @@ impl TableMetrics {
.u64_value_observer( .u64_value_observer(
"table.size", "table.size",
move |observer| { move |observer| {
if let Ok(Some(v)) = store.fast_len() { if let Ok(value) = store.len() {
observer.observe( observer.observe(
v as u64, value as u64,
&[KeyValue::new("table_name", table_name)], &[KeyValue::new("table_name", table_name)],
); );
} }
@ -49,9 +48,9 @@ impl TableMetrics {
.u64_value_observer( .u64_value_observer(
"table.merkle_tree_size", "table.merkle_tree_size",
move |observer| { move |observer| {
if let Ok(Some(v)) = merkle_tree.fast_len() { if let Ok(value) = merkle_tree.len() {
observer.observe( observer.observe(
v as u64, value as u64,
&[KeyValue::new("table_name", table_name)], &[KeyValue::new("table_name", table_name)],
); );
} }
@ -77,10 +76,12 @@ impl TableMetrics {
.u64_value_observer( .u64_value_observer(
"table.gc_todo_queue_length", "table.gc_todo_queue_length",
move |observer| { move |observer| {
observer.observe( if let Ok(value) = gc_todo.len() {
gc_todo.len() as u64, observer.observe(
&[KeyValue::new("table_name", table_name)], value as u64,
); &[KeyValue::new("table_name", table_name)],
);
}
}, },
) )
.with_description("Table garbage collector TODO queue length") .with_description("Table garbage collector TODO queue length")

View file

@ -1,24 +1,36 @@
use std::sync::Arc; use std::sync::Arc;
use garage_rpc::ring::*; use garage_rpc::layout::*;
use garage_rpc::system::System; use garage_rpc::system::System;
use garage_util::data::*; use garage_util::data::*;
use crate::replication::*; use crate::replication::*;
// TODO: find a way to track layout changes for this as well
// The hard thing is that this data is stored also on gateway nodes,
// whereas sharded data is stored only on non-Gateway nodes (storage nodes)
// Also we want to be more tolerant to failures of gateways so we don't
// want to do too much holding back of data when progress of gateway
// nodes is not reported in the layout history's ack/sync/sync_ack maps.
/// Full replication schema: all nodes store everything /// Full replication schema: all nodes store everything
/// Writes are disseminated in an epidemic manner in the network
/// Advantage: do all reads locally, extremely fast /// Advantage: do all reads locally, extremely fast
/// Inconvenient: only suitable to reasonably small tables /// Inconvenient: only suitable to reasonably small tables
/// Inconvenient: if some writes fail, nodes will read outdated data
#[derive(Clone)] #[derive(Clone)]
pub struct TableFullReplication { pub struct TableFullReplication {
/// The membership manager of this node /// The membership manager of this node
pub system: Arc<System>, pub system: Arc<System>,
/// Max number of faults allowed while replicating a record
pub max_faults: usize,
} }
impl TableReplication for TableFullReplication { impl TableReplication for TableFullReplication {
type WriteSets = Vec<Vec<Uuid>>;
fn storage_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
let layout = self.system.cluster_layout();
layout.current().all_nodes().to_vec()
}
fn read_nodes(&self, _hash: &Hash) -> Vec<Uuid> { fn read_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
vec![self.system.id] vec![self.system.id]
} }
@ -26,26 +38,36 @@ impl TableReplication for TableFullReplication {
1 1
} }
fn write_nodes(&self, _hash: &Hash) -> Vec<Uuid> { fn write_sets(&self, hash: &Hash) -> Self::WriteSets {
let ring = self.system.ring.borrow(); vec![self.storage_nodes(hash)]
ring.layout.node_ids().to_vec()
} }
fn write_quorum(&self) -> usize { fn write_quorum(&self) -> usize {
let nmembers = self.system.ring.borrow().layout.node_ids().len(); let nmembers = self.system.cluster_layout().current().all_nodes().len();
if nmembers > self.max_faults {
nmembers - self.max_faults let max_faults = if nmembers > 1 { 1 } else { 0 };
if nmembers > max_faults {
nmembers - max_faults
} else { } else {
1 1
} }
} }
fn max_write_errors(&self) -> usize {
self.max_faults
}
fn partition_of(&self, _hash: &Hash) -> Partition { fn partition_of(&self, _hash: &Hash) -> Partition {
0u16 0u16
} }
fn partitions(&self) -> Vec<(Partition, Hash)> {
vec![(0u16, [0u8; 32].into())] fn sync_partitions(&self) -> SyncPartitions {
let layout = self.system.cluster_layout();
let layout_version = layout.current().version;
SyncPartitions {
layout_version,
partitions: vec![SyncPartition {
partition: 0u16,
first_hash: [0u8; 32].into(),
last_hash: [0xff; 32].into(),
storage_sets: vec![layout.current().all_nodes().to_vec()],
}],
}
} }
} }

View file

@ -1,25 +1,43 @@
use garage_rpc::ring::*; use garage_rpc::layout::*;
use garage_util::data::*; use garage_util::data::*;
/// Trait to describe how a table shall be replicated /// Trait to describe how a table shall be replicated
pub trait TableReplication: Send + Sync + 'static { pub trait TableReplication: Send + Sync + 'static {
type WriteSets: AsRef<Vec<Vec<Uuid>>> + AsMut<Vec<Vec<Uuid>>> + Send + Sync + 'static;
// See examples in table_sharded.rs and table_fullcopy.rs // See examples in table_sharded.rs and table_fullcopy.rs
// To understand various replication methods // To understand various replication methods
/// The entire list of all nodes that store a partition
fn storage_nodes(&self, hash: &Hash) -> Vec<Uuid>;
/// Which nodes to send read requests to /// Which nodes to send read requests to
fn read_nodes(&self, hash: &Hash) -> Vec<Uuid>; fn read_nodes(&self, hash: &Hash) -> Vec<Uuid>;
/// Responses needed to consider a read succesfull /// Responses needed to consider a read succesfull
fn read_quorum(&self) -> usize; fn read_quorum(&self) -> usize;
/// Which nodes to send writes to /// Which nodes to send writes to
fn write_nodes(&self, hash: &Hash) -> Vec<Uuid>; fn write_sets(&self, hash: &Hash) -> Self::WriteSets;
/// Responses needed to consider a write succesfull /// Responses needed to consider a write succesfull in each set
fn write_quorum(&self) -> usize; fn write_quorum(&self) -> usize;
fn max_write_errors(&self) -> usize;
// Accessing partitions, for Merkle tree & sync // Accessing partitions, for Merkle tree & sync
/// Get partition for data with given hash /// Get partition for data with given hash
fn partition_of(&self, hash: &Hash) -> Partition; fn partition_of(&self, hash: &Hash) -> Partition;
/// List of existing partitions /// List of partitions and nodes to sync with in current layout
fn partitions(&self) -> Vec<(Partition, Hash)>; fn sync_partitions(&self) -> SyncPartitions;
}
#[derive(Debug)]
pub struct SyncPartitions {
pub layout_version: u64,
pub partitions: Vec<SyncPartition>,
}
#[derive(Debug)]
pub struct SyncPartition {
pub partition: Partition,
pub first_hash: Hash,
pub last_hash: Hash,
pub storage_sets: Vec<Vec<Uuid>>,
} }

View file

@ -1,6 +1,6 @@
use std::sync::Arc; use std::sync::Arc;
use garage_rpc::ring::*; use garage_rpc::layout::*;
use garage_rpc::system::System; use garage_rpc::system::System;
use garage_util::data::*; use garage_util::data::*;
@ -25,29 +25,59 @@ pub struct TableShardedReplication {
} }
impl TableReplication for TableShardedReplication { impl TableReplication for TableShardedReplication {
type WriteSets = WriteLock<Vec<Vec<Uuid>>>;
fn storage_nodes(&self, hash: &Hash) -> Vec<Uuid> {
self.system.cluster_layout().storage_nodes_of(hash)
}
fn read_nodes(&self, hash: &Hash) -> Vec<Uuid> { fn read_nodes(&self, hash: &Hash) -> Vec<Uuid> {
let ring = self.system.ring.borrow(); self.system.cluster_layout().read_nodes_of(hash)
ring.get_nodes(hash, self.replication_factor)
} }
fn read_quorum(&self) -> usize { fn read_quorum(&self) -> usize {
self.read_quorum self.read_quorum
} }
fn write_nodes(&self, hash: &Hash) -> Vec<Uuid> { fn write_sets(&self, hash: &Hash) -> Self::WriteSets {
let ring = self.system.ring.borrow(); self.system.layout_manager.write_sets_of(hash)
ring.get_nodes(hash, self.replication_factor)
} }
fn write_quorum(&self) -> usize { fn write_quorum(&self) -> usize {
self.write_quorum self.write_quorum
} }
fn max_write_errors(&self) -> usize {
self.replication_factor - self.write_quorum
}
fn partition_of(&self, hash: &Hash) -> Partition { fn partition_of(&self, hash: &Hash) -> Partition {
self.system.ring.borrow().partition_of(hash) self.system.cluster_layout().current().partition_of(hash)
} }
fn partitions(&self) -> Vec<(Partition, Hash)> {
self.system.ring.borrow().partitions() fn sync_partitions(&self) -> SyncPartitions {
let layout = self.system.cluster_layout();
let layout_version = layout.ack_map_min();
let mut partitions = layout
.current()
.partitions()
.map(|(partition, first_hash)| {
let storage_sets = layout.storage_sets_of(&first_hash);
SyncPartition {
partition,
first_hash,
last_hash: [0u8; 32].into(), // filled in just after
storage_sets,
}
})
.collect::<Vec<_>>();
for i in 0..partitions.len() {
partitions[i].last_hash = if i + 1 < partitions.len() {
partitions[i + 1].first_hash
} else {
[0xFFu8; 32].into()
};
}
SyncPartitions {
layout_version,
partitions,
}
} }
} }

Some files were not shown because too many files have changed in this diff Show more