update Cargo.nix

Add missing opentelemetry features
Add/Fix OpenTelemetry
2022-04-08 14:35:09 +02:00 · 2022-04-08 14:21:04 +02:00 · 2022-04-07 16:12:35 +02:00 · 2022-04-07 16:12:35 +02:00 · 2022-04-07 11:50:03 +02:00 · 2022-04-07 11:49:29 +02:00
101 changed files with 11999 additions and 2077 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.nix
+++ b/Cargo.nix
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,10 +3,12 @@ members = [
 	"src/util",
 	"src/rpc",
 	"src/table",
+	"src/block",
 	"src/model",
+	"src/admin",
 	"src/api",
 	"src/web",
-	"src/garage",
+	"src/garage"
 ]

 [profile.dev]
--- a/default.nix
+++ b/default.nix
@ -11,14 +11,26 @@ with import ./nix/common.nix;
 let
  crossSystem = { config = target; };
 in let
+  log = v: builtins.trace v v;
+
  pkgs = import pkgsSrc {
    inherit system crossSystem;
    overlays = [ cargo2nixOverlay ];
  };

+
+  /*
+   Rust and Nix triples are not the same. Cargo2nix has a dedicated library
+   to convert Nix triples to Rust ones. We need this conversion as we want to
+   set later options linked to our (rust) target in a generic way. Not only
+   the triple terminology is different, but also the "roles" are named differently.
+   Nix uses a build/host/target terminology where Nix's "host" maps to Cargo's "target".
+  */
+  rustTarget = log (pkgs.rustBuilder.rustLib.rustTriple pkgs.stdenv.hostPlatform);
+
  /*
   Cargo2nix is built for rustOverlay which installs Rust from Mozilla releases.
-   We want our own Rust to avoir incompatibilities, like we had with musl 1.2.0.
+   We want our own Rust to avoid incompatibilities, like we had with musl 1.2.0.
   rustc was built with musl < 1.2.0 and nix shipped musl >= 1.2.0 which lead to compilation breakage.
   So we want a Rust release that is bound to our Nix repository to avoid these problems.
   See here for more info: https://musl.libc.org/time64.html
@ -35,53 +47,93 @@ in let
    ];
  };

+  /*
+   Cargo2nix provides many overrides by default, you can take inspiration from them:
+   https://github.com/cargo2nix/cargo2nix/blob/master/overlay/overrides.nix
+
+   You can have a complete list of the available options by looking at the overriden object, mkcrate:
+   https://github.com/cargo2nix/cargo2nix/blob/master/overlay/mkcrate.nix
+  */
  overrides = pkgs.rustBuilder.overrides.all ++ [
    /*
-     We want to inject the git version while keeping the build deterministic.
+     [1] We need to alter Nix hardening to be able to statically compile: PIE,
+     Position Independent Executables seems to be supported only on amd64. Having
+     this flags set either make our executables crash or compile as dynamic on many platforms.
+     In the following section codegenOpts, we reactive it for the supported targets
+     (only amd64 curently) through the `-static-pie` flag. PIE is a feature used
+     by ASLR, which helps mitigate security issues.
+     Learn more about Nix Hardening: https://github.com/NixOS/nixpkgs/blob/master/pkgs/build-support/cc-wrapper/add-hardening.sh
+
+     [2] We want to inject the git version while keeping the build deterministic.
     As we do not want to consider the .git folder as part of the input source,
     we ask the user (the CI often) to pass the value to Nix.
    */
    (pkgs.rustBuilder.rustLib.makeOverride {
      name = "garage";
-      overrideAttrs = drv: if git_version != null then {
+      overrideAttrs = drv:
+        /* [1] */ { hardeningDisable = [ "pie" ]; }
+        //
+        /* [2] */ (if git_version != null then {
          preConfigure = ''
            ${drv.preConfigure or ""}
            export GIT_VERSION="${git_version}"
          '';
-      } else {};
+        } else {});
    })

    /*
-     On a sandbox pure NixOS environment, /usr/bin/file is not available.
-     This is a known problem: https://github.com/NixOS/nixpkgs/issues/98440
-     We simply patch the file as suggested
+     We ship some parts of the code disabled by default by putting them behind a flag.
+     It speeds up the compilation (when the feature is not required) and released crates have less dependency by default (less attack surface, disk space, etc.).
+     But we want to ship these additional features when we release Garage.
+     In the end, we chose to exclude all features from debug builds while putting (all of) them in the release builds.
+     Currently, the only feature of Garage is kubernetes-discovery from the garage_rpc crate.
    */
-    /*(pkgs.rustBuilder.rustLib.makeOverride {
-      name = "libsodium-sys";
-      overrideAttrs = drv: {
-        preConfigure = ''
-          ${drv.preConfigure or ""}
-          sed -i 's,/usr/bin/file,${file}/bin/file,g' ./configure
-        '';
-      }
-    })*/
+    (pkgs.rustBuilder.rustLib.makeOverride {
+      name = "garage_rpc";
+      overrideArgs = old:
+        {
+          features = if release then [ "kubernetes-discovery" ] else [];
+        };
+    })
  ];

  packageFun = import ./Cargo.nix;

+  /*
+    We compile fully static binaries with musl to simplify deployment on most systems.
+    When possible, we reactivate PIE hardening (see above).
+
+    Also, if you set the RUSTFLAGS environment variable, the following parameters will
+    be ignored.
+
+    For more information on static builds, please refer to Rust's RFC 1721.
+    https://rust-lang.github.io/rfcs/1721-crt-static.html#specifying-dynamicstatic-c-runtime-linkage
+  */
+
+  codegenOpts = {
+   "armv6l-unknown-linux-musleabihf" = [ "target-feature=+crt-static" "link-arg=-static" ]; /* compile as dynamic with static-pie */
+   "aarch64-unknown-linux-musl" = [ "target-feature=+crt-static" "link-arg=-static" ]; /* segfault with static-pie */
+   "i686-unknown-linux-musl" = [ "target-feature=+crt-static" "link-arg=-static" ]; /* segfault with static-pie */
+   "x86_64-unknown-linux-musl" = [ "target-feature=+crt-static" "link-arg=-static-pie" ];
+  };
+
  /*
   The following definition is not elegant as we use a low level function of Cargo2nix
-   that enables us to pass our custom rustChannel object
+   that enables us to pass our custom rustChannel object. We need this low level definition
+   to pass Nix's Rust toolchains instead of Mozilla's one.
+
+   target is mandatory but must be kept to null to allow cargo2nix to set it to the appropriate value
+   for each crate.
  */ 
  rustPkgs = pkgs.rustBuilder.makePackageSet {
-    inherit packageFun rustChannel release;
+    inherit packageFun rustChannel release codegenOpts;
    packageOverrides = overrides;
-    target = null; /* we set target to null because we want that cargo2nix computes it automatically */
+    target = null;

    buildRustPackages = pkgs.buildPackages.rustBuilder.makePackageSet {
-      inherit rustChannel packageFun;
+      inherit rustChannel packageFun codegenOpts;
      packageOverrides = overrides;
-      target = null; /* we set target to null because we want that cargo2nix computes it automatically */ 
+      target = null;
    };
  };

--- a/doc/book/connect/backup.md
+++ b/doc/book/connect/backup.md
@ -25,7 +25,22 @@ If you still want to use Borg, you can use it with `rclone mount`.

 ## Duplicati

-*External links:* [Duplicati Documentation > Storage Providers](https://github.com/kees-z/DuplicatiDocs/blob/master/docs/05-storage-providers.md#user-content-s3-compatible)
+*External links:* [Duplicati Documentation > Storage Providers](https://duplicati.readthedocs.io/en/latest/05-storage-providers/#s3-compatible)
+
+The following fields need to be specified:
+```
+Storage Type: S3 Compatible
+Use SSL: [ ] # Only if you have SSL
+Server: Custom server url (s3.garage.localhost:3900)
+Bucket name: bucket-name
+Bucket create region: Custom region value (garage) # Or as you've specified in garage.toml
+AWS Access ID: Key ID from "garage key info key-name"
+AWS Access Key: Secret key from "garage key info key-name"
+Client Library to use: Minio SDK
+```
+
+Click `Test connection` and then no when asked `The bucket name should start with your username, prepend automatically?`. Then it should say `Connection worked!`.
+

 ## knoxite

--- a/doc/book/connect/cli.md
+++ b/doc/book/connect/cli.md
@ -13,7 +13,8 @@ These tools are particularly suitable for debug, backups, website deployments or
 | [rclone](#rclone)     | ✅       |    |
 | [s3cmd](#s3cmd)     | ✅       |    |
 | [(Cyber)duck](#cyberduck)     | ✅       |    |
-| [WinSCP (libs3)](#winscp)     | ✅       | No instructions yet   |
+| [WinSCP (libs3)](#winscp)     | ✅       | CLI instructions only   |
+| [sftpgo](#sftpgo)     | ✅       |    |


 ## Minio client
@ -281,5 +282,59 @@ duck --delete garage:/my-files/an-object.txt

 ## WinSCP (libs3) {#winscp}

-*No instruction yet. You can find ones in french [in our wiki](https://wiki.deuxfleurs.fr/fr/Guide/Garage/WinSCP).*
+*You can find instructions on how to use the GUI in french [in our wiki](https://wiki.deuxfleurs.fr/fr/Guide/Garage/WinSCP).*
+
+How to use `winscp.com`, the CLI interface of WinSCP:
+
+```
+open s3://GKxxxxx:yyyyyyy@127.0.0.1:4443 -certificate=* -rawsettings S3DefaultRegion=garage S3UrlStyle=1
+ls
+ls my-files/
+get my-files/an-object.txt Z:\tmp\object.txt
+put Z:\tmp\object.txt my-files/another-object.txt
+rm my-files/an-object
+exit
+```
+
+Notes:
+  - It seems WinSCP supports only TLS connections for S3
+  - `-certificate=*` allows self-signed certificates, remove it if you have valid certificates
+
+
+## sftpgo {#sftpgo}
+
+sftpgo needs a database to work, by default it uses sqlite and does not require additional configuration.
+You can then directly init it:
+
+```
+sftpgo initprovider
+```
+
+Then you can directly launch the daemon that will listen by default on `:8080 (http)` and `:2022 (ssh)`:
+
+```
+sftpgo serve
+```
+
+Go to the admin web interface (http://[::1]:8080/web/admin/), create the required admin account, then create a user account.
+Choose a username (eg: `ada`) and a password.
+
+In the filesystem section, choose:
+  - Storage: AWS S3 (Compatible)
+  - Bucket: *your bucket name*
+  - Region: `garage` (or the one you defined in `config.toml`)
+  - Access key: *your access key*
+  - Access secret: *your secret key*
+  - Endpoint: *your endpoint*, eg. `https://garage.example.tld`, note that the protocol (`https` here) must be specified. Non standard ports and `http` have not been tested yet.
+  - Keep the default values for other fields
+  - Tick "Use path-style addressing". It should work without ticking it if you have correctly configured your instance to use URL vhost-style.
+
+Now you can access your bucket through SFTP:
+
+```
+sftp -P2022 ada@[::1]
+ls
+```
+
+And through the web interface at http://[::1]:8080/web/client

--- a/doc/book/cookbook/real-world.md
+++ b/doc/book/cookbook/real-world.md
@ -23,7 +23,7 @@ To run a real-world deployment, make sure the following conditions are met:

 - Ideally, each machine should have a SSD available in addition to the HDD you are dedicating
  to Garage. This will allow for faster access to metadata and has the potential
-  to drastically reduce Garage's response times.
+  to significantly reduce Garage's response times.

 - This guide will assume you are using Docker containers to deploy Garage on each node. 
  Garage can also be run independently, for instance as a [Systemd service](@/documentation/cookbook/systemd.md).
@ -35,12 +35,19 @@ For our example, we will suppose the following infrastructure with IPv6 connecti

 | Location | Name    | IP Address | Disk Space |
 |----------|---------|------------|------------|
-| Paris    | Mercury | fc00:1::1  | 1 To       |
-| Paris    | Venus   | fc00:1::2  | 2 To       |
-| London   | Earth   | fc00:B::1  | 2 To       |
-| Brussels | Mars    | fc00:F::1  | 1.5 To     |
-
+| Paris    | Mercury | fc00:1::1  | 1 TB       |
+| Paris    | Venus   | fc00:1::2  | 2 TB       |
+| London   | Earth   | fc00:B::1  | 2 TB       |
+| Brussels | Mars    | fc00:F::1  | 1.5 TB     |

+Note that Garage will **always** store the three copies of your data on nodes at different
+locations. This means that in the case of this small example, the available capacity
+of the cluster is in fact only 1.5 TB, because nodes in Brussels can't store more than that.
+This also means that nodes in Paris and London will be under-utilized.
+To make better use of the available hardware, you should ensure that the capacity
+available in the different locations of your cluster is roughly the same.
+For instance, here, the Mercury node could be moved to Brussels; this would allow the cluster
+to store 2 TB of data in total.

 ## Get a Docker image

@ -208,10 +215,10 @@ For our example, we will suppose we have the following infrastructure

 | Location | Name    | Disk Space | `Capacity` | `Identifier` | `Zone` |
 |----------|---------|------------|------------|--------------|--------------|
-| Paris    | Mercury | 1 To       | `10`       | `563e`     | `par1`       |
-| Paris    | Venus   | 2 To       | `20`       | `86f0`     | `par1`       |
-| London   | Earth   | 2 To       | `20`       | `6814`     | `lon1`       |
-| Brussels | Mars    | 1.5 To     | `15`       | `212f`     | `bru1`       |
+| Paris    | Mercury | 1 TB       | `10`       | `563e`     | `par1`       |
+| Paris    | Venus   | 2 TB       | `20`       | `86f0`     | `par1`       |
+| London   | Earth   | 2 TB       | `20`       | `6814`     | `lon1`       |
+| Brussels | Mars    | 1.5 TB     | `15`       | `212f`     | `bru1`       |

 #### Node identifiers

@ -261,10 +268,10 @@ have 66% chance of being stored by Venus and 33% chance of being stored by Mercu
 Given the information above, we will configure our cluster as follow:

 ```bash
-garage layout assign -z par1 -c 10 -t mercury 563e
-garage layout assign -z par1 -c 20 -t venus 86f0
-garage layout assign -z lon1 -c 20 -t earth 6814
-garage layout assign -z bru1 -c 15 -t mars 212f
+garage layout assign 563e -z par1 -c 10 -t mercury
+garage layout assign 86f0 -z par1 -c 20 -t venus
+garage layout assign 6814 -z lon1 -c 20 -t earth 
+garage layout assign 212f -z bru1 -c 15 -t mars 
 ```

 At this point, the changes in the cluster layout have not yet been applied.
--- a/doc/book/cookbook/upgrading.md
+++ b/doc/book/cookbook/upgrading.md
@ -0,0 +1,50 @@
+++
+title = "Upgrading Garage"
+weight = 40
+++
+
+Garage is a stateful clustered application, where all nodes are communicating together and share data structures.
+It makes upgrade more difficult than stateless applications so you must be more careful when upgrading.
+On a new version release, there is 2 possibilities:
+  - protocols and data structures remained the same ➡️ this is a **straightforward upgrade**
+  - protocols or data structures changed  ➡️  this is an **advanced upgrade**
+
+You can quickly now what type of update you will have to operate by looking at the version identifier.
+Following the [SemVer ](https://semver.org/) terminology, if only the *patch* number changed, it will only need a straightforward upgrade.
+Example: an upgrade from v0.6.0 from v0.6.1 is a straightforward upgrade.
+If the *minor* or *major* number changed however, you will have to do an advanced upgrade. Example: from v0.6.1 to v0.7.0.
+
+Migrations are designed to be run only between contiguous versions (from a *major*.*minor* perspective, *patches* can be skipped).
+Example: migrations from v0.6.1 to v0.7.0 and from v0.6.0 to v0.7.0 are supported but migrations from v0.5.0 to v0.7.0 are not supported.
+
+## Straightforward upgrades
+
+Straightforward upgrades do not imply cluster downtime.
+Before upgrading, you should still read [the changelog](https://git.deuxfleurs.fr/Deuxfleurs/garage/releases) and ideally test your deployment on a staging cluster before.
+
+When you are ready, start by checking the health of your cluster.
+You can force some checks with `garage repair`, we recommend at least running `garage repair --all-nodes --yes` that is very quick to run (less than a minute).
+You will see that the command correctly terminated in the logs of your daemon.
+
+Finally, you can simply upgrades nodes one by one. 
+For each node: stop it, install the new binary, edit the configuration if needed, restart it. 
+
+## Advanced upgrades
+
+Advanced upgrades will imply cluster downtime.
+Before upgrading, you must read [the changelog](https://git.deuxfleurs.fr/Deuxfleurs/garage/releases) and you must test your deployment on a staging cluster before.
+
+From a high level perspective, an advanced upgrade looks like this:
+  1. Make sure the health of your cluster is good (see `garage repair`)
+  2. Disable API access (comment the configuration in your reverse proxy)
+  3. Check that your cluster is idle
+  4. Stop the whole cluster
+  5. Backup the metadata folder of all your nodes, so that you will be able to restore it quickly if the upgrade fails (blocks being immutable, they should not be impacted)
+  6. Install the new binary, update the configuration
+  7. Start the whole cluster
+  8. If needed, run the corresponding migration from `garage migrate`
+  9. Make sure the health of your cluster is good
+  10. Enable API access (uncomment the configuration in your reverse proxy)
+  11. Monitor your cluster while load comes back, check that all your applications are happy with this new version
+
+We write guides for each advanced upgrade, they are stored under the "Working Documents" section of this documentation.
--- a/doc/book/reference-manual/configuration.md
+++ b/doc/book/reference-manual/configuration.md
@ -29,6 +29,10 @@ bootstrap_peers = [
 consul_host = "consul.service"
 consul_service_name = "garage-daemon"

+kubernetes_namespace = "garage"
+kubernetes_service_name = "garage-daemon"
+kubernetes_skip_crd = false
+
 sled_cache_capacity = 134217728
 sled_flush_every_ms = 2000

@ -40,6 +44,10 @@ root_domain = ".s3.garage"
 [s3_web]
 bind_addr = "[::]:3902"
 root_domain = ".web.garage"
+
+[admin]
+api_bind_addr = "0.0.0.0:3903"
+trace_sink = "http://localhost:4317"
 ```

 The following gives details about each available configuration option.
@ -80,20 +88,47 @@ might use more storage space that is optimally possible.

 Garage supports the following replication modes:

- `none` or `1`: data stored on Garage is stored on a single node. There is no redundancy,
-  and data will be unavailable as soon as one node fails or its network is disconnected.
-  Do not use this for anything else than test deployments.
+- `none` or `1`: data stored on Garage is stored on a single node. There is no
+  redundancy, and data will be unavailable as soon as one node fails or its
+  network is disconnected.  Do not use this for anything else than test
+  deployments.

- `2`: data stored on Garage will be stored on two different nodes, if possible in different
-  zones. Garage tolerates one node failure before losing data. Data should be available
-  read-only when one node is down, but write operations will fail.
-  Use this only if you really have to.
+- `2`: data stored on Garage will be stored on two different nodes, if possible
+  in different zones. Garage tolerates one node failure, or several nodes
+  failing but all in a single zone (in a deployment with at least two zones),
+  before losing data. Data remains available in read-only mode when one node is
+  down, but write operations will fail.

- `3`: data stored on Garage will be stored on three different nodes, if possible each in
-  a different zones.
-  Garage tolerates two node failure before losing data. Data should be available
-  read-only when two nodes are down, and writes should be possible if only a single node
-  is down.
+  - `2-dangerous`: a variant of mode `2`, where written objects are written to
+    the second replica asynchronously. This means that Garage will return `200
+    OK` to a PutObject request before the second copy is fully written (or even
+    before it even starts being written).  This means that data can more easily
+    be lost if the node crashes before a second copy can be completed.  This
+    also means that written objects might not be visible immediately in read
+    operations.  In other words, this mode severely breaks the consistency and
+    durability guarantees of standard Garage cluster operation.  Benefits of
+    this mode: you can still write to your cluster when one node is
+    unavailable.
+
+- `3`: data stored on Garage will be stored on three different nodes, if
+  possible each in a different zones.  Garage tolerates two node failure, or
+  several node failures but in no more than two zones (in a deployment with at
+  least three zones), before losing data. As long as only a single node fails,
+  or node failures are only in a single zone, reading and writing data to
+  Garage can continue normally.
+
+  - `3-degraded`: a variant of replication mode `3`, that lowers the read
+    quorum to `1`, to allow you to read data from your cluster when several
+    nodes (or nodes in several zones) are unavailable.  In this mode, Garage
+    does not provide read-after-write consistency anymore.  The write quorum is
+    still 2, ensuring that data successfully written to Garage is stored on at
+    least two nodes.
+
+  - `3-dangerous`: a variant of replication mode `3` that lowers both the read
+    and write quorums to `1`, to allow you to both read and write to your
+    cluster when several nodes (or nodes in several zones) are unavailable.  It
+    is the least consistent mode of operation proposed by Garage, and also one
+    that should probably never be used.

 Note that in modes `2` and `3`,
 if at least the same number of zones are available, an arbitrary number of failures in 
@ -102,8 +137,35 @@ any given zone is tolerated as copies of data will be spread over several zones.
 **Make sure `replication_mode` is the same in the configuration files of all nodes.
 Never run a Garage cluster where that is not the case.**

-Changing the `replication_mode` of a cluster might work (make sure to shut down all nodes
-and changing it everywhere at the time), but is not officially supported.
+The quorums associated with each replication mode are described below:
+
+| `replication_mode` | Number of replicas | Write quorum | Read quorum | Read-after-write consistency? |
+| ------------------ | ------------------ | ------------ | ----------- | ----------------------------- |
+| `none` or `1`      | 1                  | 1            | 1           | yes                           |
+| `2`                | 2                  | 2            | 1           | yes                           |
+| `2-dangerous`      | 2                  | 1            | 1           | NO                            |
+| `3`                | 3                  | 2            | 2           | yes                           |
+| `3-degraded`       | 3                  | 2            | 1           | NO                            |
+| `3-dangerous`      | 3                  | 1            | 1           | NO                            |
+
+Changing the `replication_mode` between modes with the same number of replicas
+(e.g. from `3` to `3-degraded`, or from `2-dangerous` to `2`), can be done easily by
+just changing the `replication_mode` parameter in your config files and restarting all your
+Garage nodes.
+
+It is also technically possible to change the replication mode to a mode with a
+different numbers of replicas, although it's a dangerous operation that is not
+officially supported.  This requires you to delete the existing cluster layout
+and create a new layout from scratch, meaning that a full rebalancing of your
+cluster's data will be needed.  To do it, shut down your cluster entirely,
+delete the `custer_layout` files in the meta directories of all your nodes,
+update all your configuration files with the new `replication_mode` parameter,
+restart your cluster, and then create a new layout with all the nodes you want
+to keep.  Rebalancing data will take some time, and data might temporarily
+appear unavailable to your users.  It is recommended to shut down public access
+to the cluster while rebalancing is in progress.  In theory, no data should be
+lost as rebalancing is a routine operation for Garage, although we cannot
+guarantee you that everything will go right in such an extreme scenario.

 ### `compression_level`

@ -181,6 +243,20 @@ RPC ports are announced.

 Garage does not yet support talking to Consul over TLS.

+### `kubernetes_namespace`, `kubernetes_service_name` and `kubernetes_skip_crd`
+
+Garage supports discovering other nodes of the cluster using kubernetes custom
+resources. For this to work `kubernetes_namespace` and `kubernetes_service_name`
+need to be configured.
+
+`kubernetes_namespace` sets the namespace in which the custom resources are
+configured. `kubernetes_service_name` is added as a label to these resources to
+filter them, to allow for multiple deployments in a single namespace.
+
+`kubernetes_skip_crd` can be set to true to disable the automatic creation and
+patching of the `garagenodes.deuxfleurs.fr` CRD. You will need to create the CRD
+manually.
+
 ### `sled_cache_capacity`

 This parameter can be used to tune the capacity of the cache used by
@ -242,3 +318,21 @@ For instance, if `root_domain` is `web.garage.eu`, a bucket called `deuxfleurs.f
 will be accessible either with hostname `deuxfleurs.fr.web.garage.eu`
 or with hostname `deuxfleurs.fr`.

+
+## The `[admin]` section
+
+Garage has a few administration capabilities, in particular to allow remote monitoring. These features are detailed below.
+
+### `api_bind_addr`
+
+If specified, Garage will bind an HTTP server to this port and address, on
+which it will listen to requests for administration features. Currently,
+this endpoint only exposes Garage metrics in the Prometheus format at
+`/metrics`. This endpoint is not authenticated. In the future, bucket and
+access key management might be possible by REST calls to this endpoint.
+
+### `trace_sink`
+
+Optionnally, the address of an Opentelemetry collector.  If specified,
+Garage will send traces in the Opentelemetry format to this endpoint. These
+trace allow to inspect Garage's operation when it handles S3 API requests.
--- a/doc/book/reference-manual/routing.md
+++ b/doc/book/reference-manual/routing.md
@ -0,0 +1,45 @@
+++
+title = "Request routing logic"
+weight = 10
+++
+
+Data retrieval requests to Garage endpoints (S3 API and websites) are resolved 
+to an individual object in a bucket. Since objects are replicated to multiple nodes 
+Garage must ensure consistency before answering the request.
+
+## Using quorum to ensure consistency
+
+Garage ensures consistency by attempting to establish a quorum with the
+data nodes responsible for the object. When a majority of the data nodes
+have provided metadata on a object Garage can then answer the request.
+
+When a request arrives Garage will, assuming the recommended 3 replicas, perform the following actions:
+
+- Make a request to the two preferred nodes for object metadata
+- Try the third node if one of the two initial requests fail
+- Check that the metadata from at least 2 nodes match
+- Check that the object hasn't been marked deleted
+- Answer the request with inline data from metadata if object is small enough
+- Or get data blocks from the preferred nodes and answer using the assembled object
+
+Garage dynamically determines which nodes to query based on health, preference, and 
+which nodes actually host a given data. Garage has no concept of "primary" so any 
+healthy node with the data can be used as long as a quorum is reached for the metadata.
+
+## Node health
+
+Garage keeps a TCP session open to each node in the cluster and periodically pings them. If a connection
+cannot be established, or a node fails to answer a number of pings, the target node is marked as failed.
+Failed nodes are not used for quorum or other internal requests.
+
+## Node preference
+
+Garage prioritizes which nodes to query according to a few criteria:
+
+- A node always prefers itself if it can answer the request
+- Then the node prioritizes nodes in the same zone
+- Finally the nodes with the lowest latency are prioritized 
+
+
+For further reading on the cluster structure look at the [gateway](@/documentation/cookbook/gateways.md) 
+and [cluster layout management](@/documentation/reference-manual/layout.md) pages.
--- a/doc/book/working-documents/migration-06.md
+++ b/doc/book/working-documents/migration-06.md
@ -4,12 +4,12 @@ weight = 15
 +++

 **This guide explains how to migrate to 0.6 if you have an existing 0.5 cluster.
-We don't recommend trying to migrate directly from 0.4 or older to 0.6.**
+We don't recommend trying to migrate to 0.6 directly from 0.4 or older.**

 **We make no guarantee that this migration will work perfectly:
 back up all your data before attempting it!**

-Garage v0.6 (not yet released) introduces a new data model for buckets,
+Garage v0.6 introduces a new data model for buckets,
 that allows buckets to have many names (aliases).
 Buckets can also have "private" aliases (called local aliases),
 which are only visible when using a certain access key.
--- a/doc/book/working-documents/migration-07.md
+++ b/doc/book/working-documents/migration-07.md
@ -0,0 +1,31 @@
+++
+title = "Migrating from 0.6 to 0.7"
+weight = 14
+++
+**This guide explains how to migrate to 0.7 if you have an existing 0.6 cluster.
+We don't recommend trying to migrate to 0.7 directly from 0.5 or older.**
+
+**We make no guarantee that this migration will work perfectly:
+back up all your data before attempting it!**
+
+Garage v0.7 introduces a cluster protocol change to support request tracing through OpenTelemetry.
+No data structure is changed, so no data migration is required.
+
+The migration steps are as follows:
+
+1. Do `garage repair --all-nodes --yes tables` and `garage repair --all-nodes --yes blocks`,
+   check the logs and check that all data seems to be synced correctly between
+   nodes. If you have time, do additional checks (`scrub`, `block_refs`, etc.)
+2. Disable api and web access. Garage does not support disabling
+   these endpoints but you can change the port number or stop your reverse
+   proxy for instance.
+3. Check once again that your cluster is healty. Run again `garage repair --all-nodes --yes tables` which is quick.
+   Also check your queues are empty, run `garage stats` to query them.
+4. Turn off Garage v0.6
+5. Backup the metadata folder of all your nodes: `cd /var/lib/garage ; tar -acf meta-v0.6.tar.zst meta/`
+6. Install Garage v0.7, edit the configuration if you plan to use OpenTelemetry or the Kubernetes integration
+7. Turn on Garage v0.7
+8. Do `garage repair --all-nodes --yes tables` and `garage repair --all-nodes --yes blocks`
+9. Your upgraded cluster should be in a working state. Re-enable API and Web
+    access and check that everything went well.
+10. Monitor your cluster in the next hours to see if it works well under your production load, report any issue.
--- a/nix/common.nix
+++ b/nix/common.nix
@ -8,10 +8,10 @@ rec {
    sha256 = "1xy9zpypqfxs5gcq5dcla4bfkhxmh5nzn9dyqkr03lqycm9wg5cr";
  };
  cargo2nixSrc = fetchGit {
-    # As of 2022-02-03
+    # As of 2022-03-17
    url = "https://github.com/superboum/cargo2nix";
-    ref = "backward-compat";
-    rev = "08d963f32a774353ee8acf3f61749915875c1ec4";
+    ref = "main";
+    rev = "bcbf3ba99e9e01a61eb83a24624419c2dd9dec64";
  };


--- a/nix/toolchain.nix
+++ b/nix/toolchain.nix
@ -18,6 +18,7 @@ let
  pkgsHost = import pkgsSrc {};
  lib = pkgsHost.lib;
  kaniko = (import ./kaniko.nix) pkgsHost;
+  winscp = (import ./winscp.nix) pkgsHost;
 in 
  lib.flatten (builtins.map (pkgs: [
     pkgs.rustPlatform.rust.rustc
@ -25,5 +26,6 @@ in
     pkgs.buildPackages.stdenv.cc
  ]) pkgsList) ++ [
    kaniko
+    winscp
  ]

--- a/nix/winscp.nix
+++ b/nix/winscp.nix
@ -0,0 +1,28 @@
+pkgs:
+
+pkgs.stdenv.mkDerivation rec {
+  pname = "winscp";
+  version = "5.19.6";
+
+  src = pkgs.fetchzip {
+    url = "https://winscp.net/download/WinSCP-${version}-Portable.zip";
+    sha256 = "sha256-8+6JuT0b1fFQ6etaFTMSjIKvDGzmJoHAuByXiqCBzu0=";
+    stripRoot = false;
+  };
+
+  buildPhase = ''
+    cat > winscp <<EOF
+#!${pkgs.bash}/bin/bash
+
+WINEDEBUG=-all
+${pkgs.winePackages.minimal}/bin/wine $out/opt/WinSCP.com
+EOF
+  '';
+
+  installPhase = ''
+    mkdir -p $out/{bin,opt}
+    cp {WinSCP.com,WinSCP.exe} $out/opt
+    cp winscp $out/bin
+    chmod +x $out/bin/winscp
+  '';
+}
--- a/script/dev-cluster.sh
+++ b/script/dev-cluster.sh
@ -44,6 +44,9 @@ root_domain = ".s3.garage.localhost"
 bind_addr = "0.0.0.0:$((3920+$count))"
 root_domain = ".web.garage.localhost"
 index = "index.html"
+
+[admin]
+api_bind_addr = "0.0.0.0:$((9900+$count))"
 EOF

 echo -en "$LABEL configuration written to $CONF_PATH\n"
--- a/script/dev-env-winscp.sh
+++ b/script/dev-env-winscp.sh
@ -0,0 +1,4 @@
+export AWS_ACCESS_KEY_ID=`cat /tmp/garage.s3 |cut -d' ' -f1`
+export AWS_SECRET_ACCESS_KEY=`cat /tmp/garage.s3 |cut -d' ' -f2`
+export AWS_DEFAULT_REGION='garage'
+export WINSCP_URL="s3://${AWS_ACCESS_KEY_ID}:${AWS_SECRET_ACCESS_KEY}@127.0.0.1:4443 -certificate=* -rawsettings S3DefaultRegion=garage S3UrlStyle=1"
--- a/script/k8s/README.md
+++ b/script/k8s/README.md
@ -0,0 +1,13 @@
+Spawn a cluster with minikube
+
+```bash
+minikube start
+minikube kubectl -- apply -f config.yaml
+minikube kubectl -- apply -f daemon.yaml
+minikube dashboard
+
+minikube kubectl -- exec -it garage-0 --container garage -- /garage status
+# etc.
+```
+
+
--- a/script/k8s/admin.yaml
+++ b/script/k8s/admin.yaml
@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: garage-admin
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cluster-admin
+subjects:
+- apiGroup: rbac.authorization.k8s.io
+  kind: User
+  name: system:serviceaccount:default:default
--- a/script/k8s/config.yaml
+++ b/script/k8s/config.yaml
@ -0,0 +1,30 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: garage-config
+  namespace: default
+data:
+  garage.toml: |-
+    metadata_dir = "/tmp/meta"
+    data_dir = "/tmp/data"
+
+    replication_mode = "3"
+
+    rpc_bind_addr = "[::]:3901"
+    rpc_secret = "1799bccfd7411eddcf9ebd316bc1f5287ad12a68094e1c6ac6abde7e6feae1ec"
+
+    bootstrap_peers = []
+
+    kubernetes_namespace = "default"
+    kubernetes_service_name = "garage-daemon"
+    kubernetes_skip_crd = false
+
+    [s3_api]
+    s3_region = "garage"
+    api_bind_addr = "[::]:3900"
+    root_domain = ".s3.garage.tld"
+
+    [s3_web]
+    bind_addr = "[::]:3902"
+    root_domain = ".web.garage.tld"
+    index = "index.html"
--- a/script/k8s/daemon.yaml
+++ b/script/k8s/daemon.yaml
@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: garage
+spec:
+  selector:
+    matchLabels:
+      app: garage
+  serviceName: "garage"
+  replicas: 3 
+  template:
+    metadata:
+      labels:
+        app: garage
+    spec:
+      terminationGracePeriodSeconds: 10
+      containers:
+      - name: garage
+        image: dxflrs/amd64_garage:v0.7.0-rc1
+        ports:
+        - containerPort: 3900
+          name: s3-api
+        - containerPort: 3902
+          name: web-api
+        volumeMounts:
+        - name: fast
+          mountPath: /mnt/fast
+        - name: slow
+          mountPath: /mnt/slow
+        - name: etc
+          mountPath: /etc/garage.toml
+          subPath: garage.toml
+      volumes:
+      - name: etc
+        configMap:
+          name: garage-config
+  volumeClaimTemplates:
+  - metadata:
+      name: fast
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 100Mi
+  - metadata:
+      name: slow
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 100Mi
+  
--- a/script/telemetry/README.md
+++ b/script/telemetry/README.md
@ -0,0 +1,21 @@
+Configure your `[admin-api]` endpoint:
+
+```
+[admin]
+api_bind_addr = "0.0.0.0:3903"
+trace_sink = "http://localhost:4317"
+```
+
+Start the test stack:
+
+```
+cd telemetry
+docker-compose up
+```
+
+Access the web interfaces:
+  - [Kibana](http://localhost:5601) - Click on the hamburger menu, in the Observability section, click APM
+  - [Grafana](http://localhost:3000) - Set a password, then on the left menu, click Dashboard -> Browse. On the new page click Import -> Choose the test dashboard we ship `grafana-garage-dashboard-elasticsearch.json` 
+
+
+
--- a/script/telemetry/elastic/.env
+++ b/script/telemetry/elastic/.env
@ -0,0 +1,3 @@
+COMPOSE_PROJECT_NAME=telemetry
+OTEL_COLLECT_TAG=0.44.0
+ELASTIC_BUNDLE_TAG=7.17.0
--- a/script/telemetry/elastic/apm-config.yaml
+++ b/script/telemetry/elastic/apm-config.yaml
@ -0,0 +1,10 @@
+apm-server:
+  # Defines the host and port the server is listening on. Use "unix:/path/to.sock" to listen on a unix domain socket.
+  host: "0.0.0.0:8200"
+#-------------------------- Elasticsearch output --------------------------
+output.elasticsearch:
+  # Array of hosts to connect to.
+  # Scheme and port can be left out and will be set to the default (`http` and `9200`).
+  # In case you specify and additional path, the scheme is required: `http://localhost:9200/path`.
+  # IPv6 addresses should always be defined as: `https://[2001:db8::1]:9200`.
+  hosts: ["localhost:9200"]
--- a/script/telemetry/elastic/docker-compose.yml
+++ b/script/telemetry/elastic/docker-compose.yml
@ -0,0 +1,69 @@
+version: "2"
+services:
+
+  otel:
+    image: otel/opentelemetry-collector-contrib:${OTEL_COLLECT_TAG}
+    command: [ "--config=/etc/otel-config.yaml" ]
+    volumes:
+      - ./otel-config.yaml:/etc/otel-config.yaml
+    network_mode: "host"
+
+  elastic:
+    image: docker.elastic.co/elasticsearch/elasticsearch:${ELASTIC_BUNDLE_TAG}
+    container_name: elastic
+    environment:
+      - "node.name=elastic"
+      - "http.port=9200"
+      - "cluster.name=es-docker-cluster"
+      - "discovery.type=single-node"
+      - "bootstrap.memory_lock=true"
+      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile: 65536
+    volumes:
+      - "es_data:/usr/share/elasticsearch/data"
+    network_mode: "host"
+
+  # kibana instance and collectors
+  # see https://www.elastic.co/guide/en/elastic-stack-get-started/current/get-started-docker.html
+  kibana:
+    image: docker.elastic.co/kibana/kibana:${ELASTIC_BUNDLE_TAG}
+    container_name: kibana
+    environment:
+      SERVER_NAME: "kibana.local"
+      # ELASTICSEARCH_URL: "http://localhost:9700"
+      ELASTICSEARCH_HOSTS: "http://localhost:9200"
+    depends_on: [ 'elastic' ]
+    network_mode: "host"
+
+  apm:
+    image: docker.elastic.co/apm/apm-server:${ELASTIC_BUNDLE_TAG}
+    container_name: apm
+    volumes:
+      - "./apm-config.yaml:/usr/share/apm-server/apm-server.yml:ro"
+    depends_on: [ 'elastic' ]
+    network_mode: "host"
+
+  grafana:
+    # see https://grafana.com/docs/grafana/latest/installation/docker/
+    image: "grafana/grafana:8.3.5"
+    container_name: grafana
+    # restart: unless-stopped
+    environment:
+      - "GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel"
+    network_mode: "host"
+    volumes:
+      # chown 472:472 if needed
+      - grafana:/var/lib/grafana
+      - ./grafana/provisioning/:/etc/grafana/provisioning/
+
+volumes:
+  es_data:
+    driver: local
+  grafana:
+    driver: local
+  metricbeat:
+    driver: local
--- a/script/telemetry/elastic/grafana/provisioning/datasources/elastic.yaml
+++ b/script/telemetry/elastic/grafana/provisioning/datasources/elastic.yaml
@ -0,0 +1,19 @@
+apiVersion: 1
+
+datasources:
+  - name: DS_ELASTICSEARCH
+    type: elasticsearch
+    access: proxy
+    url: http://localhost:9200
+    password: ''
+    user: ''
+    database: apm-*
+    basicAuth: false
+    isDefault: true
+    jsonData:
+      esVersion: 7.10.0
+      logLevelField: ''
+      logMessageField: ''
+      maxConcurrentShardRequests: 5
+      timeField: "@timestamp"
+    readOnly: false
--- a/script/telemetry/elastic/otel-config.yaml
+++ b/script/telemetry/elastic/otel-config.yaml
@ -0,0 +1,47 @@
+receivers:
+  # Data sources: metrics, traces
+  otlp:
+    protocols:
+      grpc:
+        endpoint: ":4317"
+      http:
+        endpoint: ":55681"
+  # Data sources: metrics
+  prometheus:
+    config:
+      scrape_configs:
+        - job_name: "garage"
+          scrape_interval: 5s
+          static_configs:
+            - targets: ["localhost:3903"]
+
+exporters:
+  logging:
+    logLevel: info
+  # see https://www.elastic.co/guide/en/apm/get-started/current/open-telemetry-elastic.html#open-telemetry-collector
+  otlp/elastic:
+    endpoint: "localhost:8200"
+    tls:
+      insecure: true
+
+processors:
+  batch:
+
+extensions:
+  health_check:
+  pprof:
+    endpoint: :1888
+  zpages:
+    endpoint: :55679
+
+service:
+  extensions: [pprof, zpages, health_check]
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [logging, otlp/elastic]
+    metrics:
+      receivers: [otlp, prometheus]
+      processors: [batch]
+      exporters: [logging, otlp/elastic]
--- a/script/telemetry/grafana-garage-dashboard-elasticsearch.json
+++ b/script/telemetry/grafana-garage-dashboard-elasticsearch.json
--- a/script/test-smoke.sh
+++ b/script/test-smoke.sh
@ -116,295 +116,33 @@ if [ -z "$SKIP_DUCK" ]; then
  done
 fi

-# Advanced testing via S3API
-if [ -z "$SKIP_AWS" ]; then
-  echo "🔌 Test S3API"
-
-  echo "Test Objects"
-  aws s3api put-object --bucket eprouvette --key a
-  aws s3api put-object --bucket eprouvette --key a/a
-  aws s3api put-object --bucket eprouvette --key a/b
-  aws s3api put-object --bucket eprouvette --key a/c
-  aws s3api put-object --bucket eprouvette --key a/d/a
-  aws s3api put-object --bucket eprouvette --key a/é
-  aws s3api put-object --bucket eprouvette --key b
-  aws s3api put-object --bucket eprouvette --key c
-
-
-  aws s3api list-objects-v2 --bucket eprouvette >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects-v2 --bucket eprouvette --page-size 0 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects-v2 --bucket eprouvette --page-size 999999999 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects-v2 --bucket eprouvette --page-size 1 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects-v2 --bucket eprouvette --delimiter '/' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 3 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-objects-v2 --bucket eprouvette --delimiter '/' --page-size 1 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 3 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-objects-v2 --bucket eprouvette --prefix 'a/' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 5 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects-v2 --bucket eprouvette --prefix 'a/' --delimiter '/' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 4 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-objects-v2 --bucket eprouvette --prefix 'a/' --page-size 1 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 5 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects-v2 --bucket eprouvette --prefix 'a/' --delimiter '/' --page-size 1 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 4 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-objects-v2 --bucket eprouvette --start-after 'Z' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects-v2 --bucket eprouvette --start-after 'c' >$CMDOUT
-  ! [ -s $CMDOUT ]
-
-
-  aws s3api list-objects --bucket eprouvette >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects --bucket eprouvette --page-size 1 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects --bucket eprouvette --delimiter '/' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 3 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  # @FIXME it does not work as expected but might be a limitation of aws s3api
-  # The problem is the conjunction of a delimiter + pagination + v1 of listobjects
-  #aws s3api list-objects --bucket eprouvette --delimiter '/' --page-size 1 >$CMDOUT
-  #[ $(jq '.Contents | length' $CMDOUT) == 3 ]
-  #[ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-objects --bucket eprouvette --prefix 'a/' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 5 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects --bucket eprouvette --prefix 'a/' --delimiter '/' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 4 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-objects --bucket eprouvette --prefix 'a/' --page-size 1 >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 5 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  # @FIXME idem
-  #aws s3api list-objects --bucket eprouvette --prefix 'a/' --delimiter '/' --page-size 1 >$CMDOUT
-  #[ $(jq '.Contents | length' $CMDOUT) == 4 ]
-  #[ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-objects --bucket eprouvette --starting-token 'Z' >$CMDOUT
-  [ $(jq '.Contents | length' $CMDOUT) == 8 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-objects --bucket eprouvette --starting-token 'c' >$CMDOUT
-  ! [ -s $CMDOUT ]
-
-  aws s3api list-objects-v2 --bucket eprouvette | \
-    jq -c '. | {Objects: [.Contents[] | {Key: .Key}], Quiet: true}' | \
-    aws s3api delete-objects --bucket eprouvette --delete file:///dev/stdin
-
-
-  echo "Test Multipart Upload"
-  aws s3api create-multipart-upload --bucket eprouvette --key a
-  aws s3api create-multipart-upload --bucket eprouvette --key a
-  aws s3api create-multipart-upload --bucket eprouvette --key c
-  aws s3api create-multipart-upload --bucket eprouvette --key c/a
-  aws s3api create-multipart-upload --bucket eprouvette --key c/b
-
-  aws s3api list-multipart-uploads --bucket eprouvette >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 5 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --page-size 1 >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 5 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --delimiter '/' >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 3 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --delimiter '/' --page-size 1 >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 3 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --prefix 'c' >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 3 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --prefix 'c' --page-size 1 >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 3 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --prefix 'c' --delimiter '/' >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 1 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --prefix 'c' --delimiter '/' --page-size 1 >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 1 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 1 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --starting-token 'ZZZZZ' >$CMDOUT
-  [ $(jq '.Uploads | length' $CMDOUT) == 5 ]
-  [ $(jq '.CommonPrefixes | length' $CMDOUT) == 0 ]
-  aws s3api list-multipart-uploads --bucket eprouvette --starting-token 'd' >$CMDOUT
-  ! [ -s $CMDOUT ]
-
-  aws s3api list-multipart-uploads --bucket eprouvette | \
-    jq -r '.Uploads[] | "\(.Key) \(.UploadId)"' | \
-    while read r; do 
-      key=$(echo $r|cut -d' ' -f 1); 
-      uid=$(echo $r|cut -d' ' -f 2); 
-      aws s3api abort-multipart-upload --bucket eprouvette --key $key --upload-id $uid;
-      echo "Deleted ${key}:${uid}"
+if [ -z "$SKIP_WINSCP" ]; then
+  echo "🛠️ Testing with winscp"
+  source ${SCRIPT_FOLDER}/dev-env-winscp.sh
+  winscp <<EOF
+open $WINSCP_URL
+ls
+mkdir eprouvette/winscp
+EOF
+  for idx in {1..3}.{rnd,b64}; do
+    winscp <<EOF
+open $WINSCP_URL
+put Z:\\tmp\\garage.$idx eprouvette/winscp/garage.$idx.winscp
+ls eprouvette/winscp/
+get eprouvette/winscp/garage.$idx.winscp Z:\\tmp\\garage.$idx.dl
+rm eprouvette/winscp/garage.$idx.winscp
+EOF
+    diff /tmp/garage.$idx /tmp/garage.$idx.dl
+    rm /tmp/garage.$idx.dl
  done
-
-  echo "Test for ListParts"
-  UPLOAD_ID=$(aws s3api create-multipart-upload --bucket eprouvette --key list-parts | jq -r .UploadId)
-  aws s3api list-parts --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID >$CMDOUT
-  [ $(jq '.Parts | length' $CMDOUT) == 0 ]
-  [ $(jq -r '.StorageClass' $CMDOUT) == 'STANDARD' ] # check that the result is not empty
-  ETAG1=$(aws s3api upload-part --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID --part-number 1 --body /tmp/garage.2.rnd | jq .ETag)
-  aws s3api list-parts --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID >$CMDOUT
-  [ $(jq '.Parts | length' $CMDOUT) == 1 ]
-  [ $(jq '.Parts[0].PartNumber' $CMDOUT) == 1 ]
-  [ $(jq '.Parts[0].Size' $CMDOUT) == 5242880 ]
-  [ $(jq '.Parts[0].ETag' $CMDOUT) == $ETAG1 ]
-
-  ETAG2=$(aws s3api upload-part --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID --part-number 3 --body /tmp/garage.3.rnd | jq .ETag)
-  ETAG3=$(aws s3api upload-part --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID --part-number 2 --body /tmp/garage.2.rnd | jq .ETag)
-  aws s3api list-parts --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID >$CMDOUT
-  [ $(jq '.Parts | length' $CMDOUT) == 3 ]
-  [ $(jq '.Parts[1].ETag' $CMDOUT) == $ETAG3 ]
-
-  aws s3api list-parts --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID --page-size 1 >$CMDOUT
-  [ $(jq '.Parts | length' $CMDOUT) == 3 ]
-  [ $(jq '.Parts[1].ETag' $CMDOUT) == $ETAG3 ]
-
-  cat >/tmp/garage.multipart_struct <<EOF
-{
-  "Parts": [
-    {
-      "ETag": $ETAG1,
-      "PartNumber": 1
-    },
-    {
-      "ETag": $ETAG3,
-      "PartNumber": 2
-    },
-    {
-      "ETag": $ETAG2,
-      "PartNumber": 3
-    }
-  ]
-}
+  winscp <<EOF
+open $WINSCP_URL
+rm eprouvette/winscp
 EOF
-  aws s3api complete-multipart-upload \
-    --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID \
-    --multipart-upload file:///tmp/garage.multipart_struct
-
-  ! aws s3api list-parts --bucket eprouvette --key list-parts --upload-id $UPLOAD_ID >$CMDOUT
-  aws s3 rm "s3://eprouvette/list-parts"
-
-
-  # @FIXME We do not write tests with --starting-token due to a bug with awscli
-  # See here: https://github.com/aws/aws-cli/issues/6666
-
-  echo "Test for UploadPartCopy"
-  aws s3 cp "/tmp/garage.3.rnd" "s3://eprouvette/copy_part_source"
-  UPLOAD_ID=$(aws s3api create-multipart-upload --bucket eprouvette --key test_multipart | jq -r .UploadId)
-  PART1=$(aws s3api upload-part \
-    --bucket eprouvette --key test_multipart \
-    --upload-id $UPLOAD_ID --part-number 1 \
-    --body /tmp/garage.2.rnd | jq .ETag)
-  PART2=$(aws s3api upload-part-copy \
-    --bucket eprouvette --key test_multipart \
-    --upload-id $UPLOAD_ID --part-number 2 \
-    --copy-source "/eprouvette/copy_part_source" \
-    --copy-source-range "bytes=500-5000500" \
-    | jq .CopyPartResult.ETag)
-  PART3=$(aws s3api upload-part \
-    --bucket eprouvette --key test_multipart \
-    --upload-id $UPLOAD_ID --part-number 3 \
-    --body /tmp/garage.3.rnd | jq .ETag)
-  cat >/tmp/garage.multipart_struct <<EOF
-{
-  "Parts": [
-    {
-      "ETag": $PART1,
-      "PartNumber": 1
-    },
-    {
-      "ETag": $PART2,
-      "PartNumber": 2
-    },
-    {
-      "ETag": $PART3,
-      "PartNumber": 3
-    }
-  ]
-}
-EOF
-  aws s3api complete-multipart-upload \
-    --bucket eprouvette --key test_multipart --upload-id $UPLOAD_ID \
-    --multipart-upload file:///tmp/garage.multipart_struct
-
-  aws s3 cp "s3://eprouvette/test_multipart" /tmp/garage.test_multipart
-  cat /tmp/garage.2.rnd <(tail -c +501 /tmp/garage.3.rnd | head -c 5000001) /tmp/garage.3.rnd > /tmp/garage.test_multipart_reference
-  diff /tmp/garage.test_multipart /tmp/garage.test_multipart_reference >/tmp/garage.test_multipart_diff 2>&1
-
-  aws s3 rm "s3://eprouvette/copy_part_source"
-  aws s3 rm "s3://eprouvette/test_multipart"
-
-  rm /tmp/garage.multipart_struct
-  rm /tmp/garage.test_multipart
-  rm /tmp/garage.test_multipart_reference
-  rm /tmp/garage.test_multipart_diff
-
-
-  echo "Test CORS endpoints"
-  garage -c /tmp/config.1.toml bucket website --allow eprouvette
-  aws s3api put-object --bucket eprouvette --key index.html
-  CORS='{"CORSRules":[{"AllowedHeaders":["*"],"AllowedMethods":["GET","PUT"],"AllowedOrigins":["*"]}]}'
-  aws s3api put-bucket-cors --bucket eprouvette --cors-configuration $CORS
-  [ `aws s3api get-bucket-cors --bucket eprouvette | jq -c` == $CORS ]
-
-  curl -s -i -H 'Origin: http://example.com' --header "Host: eprouvette.web.garage.localhost" http://127.0.0.1:3921/ | grep access-control-allow-origin
-  curl -s -i -X OPTIONS -H 'Access-Control-Request-Method: PUT' -H 'Origin: http://example.com' --header "Host: eprouvette.web.garage.localhost" http://127.0.0.1:3921/ | grep access-control-allow-methods
-  curl -s -i -X OPTIONS -H 'Access-Control-Request-Method: DELETE' -H 'Origin: http://example.com' --header "Host: eprouvette.web.garage.localhost" http://127.0.0.1:3921/ | grep '403 Forbidden'
-
-  #@TODO we may want to test the S3 endpoint but we need to handle authentication, which is way more complex.
-
-  aws s3api delete-bucket-cors --bucket eprouvette
-  ! [ -s `aws s3api get-bucket-cors --bucket eprouvette` ]
-  curl -s -i -X OPTIONS -H 'Access-Control-Request-Method: PUT' -H 'Origin: http://example.com' --header "Host: eprouvette.web.garage.localhost" http://127.0.0.1:3921/ | grep '403 Forbidden'
-  aws s3api delete-object --bucket eprouvette --key index.html
-  garage -c /tmp/config.1.toml bucket website --deny eprouvette
 fi

 rm /tmp/garage.{1..3}.{rnd,b64}

-if [ -z "$SKIP_AWS" ]; then
-  echo "🪣 Test bucket logic "
-  AWS_ACCESS_KEY_ID=`cat /tmp/garage.s3 |cut -d' ' -f1`
-  [ $(aws s3 ls | wc -l) == 1 ]
-  garage -c /tmp/config.1.toml bucket create seau
-  garage -c /tmp/config.1.toml bucket allow --read seau --key $AWS_ACCESS_KEY_ID
-  [ $(aws s3 ls | wc -l) == 2 ]
-  garage -c /tmp/config.1.toml bucket deny --read seau --key $AWS_ACCESS_KEY_ID
-  [ $(aws s3 ls | wc -l) == 1 ]
-  garage -c /tmp/config.1.toml bucket allow --read seau --key $AWS_ACCESS_KEY_ID
-  [ $(aws s3 ls | wc -l) == 2 ]
-  garage -c /tmp/config.1.toml bucket delete --yes seau
-  [ $(aws s3 ls | wc -l) == 1 ]
-fi
-
-if [ -z "$SKIP_AWS" ]; then
-  echo "🧪 Website Testing"
-  echo "<h1>hello world</h1>" > /tmp/garage-index.html
-  aws s3 cp /tmp/garage-index.html s3://eprouvette/index.html
-  [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.web.garage.localhost"  http://127.0.0.1:3921/ ` == 404 ]
-  garage -c /tmp/config.1.toml bucket website --allow eprouvette
-  [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.web.garage.localhost"  http://127.0.0.1:3921/ ` == 200 ]
-  garage -c /tmp/config.1.toml bucket website --deny eprouvette
-  [ `curl -s -o /dev/null -w "%{http_code}" --header "Host: eprouvette.web.garage.localhost"  http://127.0.0.1:3921/ ` == 404 ]
-  aws s3 rm s3://eprouvette/index.html
-  rm /tmp/garage-index.html
-fi
-
 echo "🏁 Teardown"
 AWS_ACCESS_KEY_ID=`cat /tmp/garage.s3 |cut -d' ' -f1`
 AWS_SECRET_ACCESS_KEY=`cat /tmp/garage.s3 |cut -d' ' -f2`
--- a/shell.nix
+++ b/shell.nix
@ -13,6 +13,7 @@ let
    overlays = [ cargo2nixOverlay ];
  };
  kaniko = (import ./nix/kaniko.nix) pkgs;
+  winscp = (import ./nix/winscp.nix) pkgs;

 in

@ -76,10 +77,13 @@ function refresh_toolchain {
     pkgs.rustPlatform.rust.cargo
     pkgs.clippy
     pkgs.rustfmt
+     pkgs.perl
+     pkgs.protobuf
     cargo2nix.packages.x86_64-linux.cargo2nix
    ] else [])
   ++
   (if integration then [
+     winscp
     pkgs.s3cmd
     pkgs.awscli2
     pkgs.minio-client
--- a/src/admin/Cargo.toml
+++ b/src/admin/Cargo.toml
@ -0,0 +1,29 @@
+[package]
+name = "garage_admin"
+version = "0.7.0"
+authors = ["Maximilien Richer <code@mricher.fr>"]
+edition = "2018"
+license = "AGPL-3.0"
+description = "Administration and metrics REST HTTP server for Garage"
+repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
+
+[lib]
+path = "lib.rs"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+garage_util = { version = "0.7.0", path = "../util" }
+
+hex = "0.4"
+
+futures = "0.3"
+futures-util = "0.3"
+http = "0.2"
+hyper = "0.14"
+tracing = "0.1.30"
+
+opentelemetry = { version = "0.17", features = [ "rt-tokio" ] }
+opentelemetry-prometheus = "0.10"
+opentelemetry-otlp = "0.10"
+prometheus = "0.13"
--- a/src/admin/lib.rs
+++ b/src/admin/lib.rs
@ -0,0 +1,6 @@
+//! Crate for handling the admin and metric HTTP APIs
+#[macro_use]
+extern crate tracing;
+
+pub mod metrics;
+pub mod tracing_setup;
--- a/src/admin/metrics.rs
+++ b/src/admin/metrics.rs
@ -0,0 +1,146 @@
+use std::convert::Infallible;
+use std::net::SocketAddr;
+use std::sync::Arc;
+use std::time::SystemTime;
+
+use futures::future::*;
+use hyper::{
+	header::CONTENT_TYPE,
+	service::{make_service_fn, service_fn},
+	Body, Method, Request, Response, Server,
+};
+
+use opentelemetry::{
+	global,
+	metrics::{BoundCounter, BoundValueRecorder},
+	trace::{FutureExt, TraceContextExt, Tracer},
+	Context,
+};
+use opentelemetry_prometheus::PrometheusExporter;
+
+use prometheus::{Encoder, TextEncoder};
+
+use garage_util::error::Error as GarageError;
+use garage_util::metrics::*;
+
+// serve_req on metric endpoint
+async fn serve_req(
+	req: Request<Body>,
+	admin_server: Arc<AdminServer>,
+) -> Result<Response<Body>, hyper::Error> {
+	debug!("Receiving request at path {}", req.uri());
+	let request_start = SystemTime::now();
+
+	admin_server.metrics.http_counter.add(1);
+
+	let response = match (req.method(), req.uri().path()) {
+		(&Method::GET, "/metrics") => {
+			let mut buffer = vec![];
+			let encoder = TextEncoder::new();
+
+			let tracer = opentelemetry::global::tracer("garage");
+			let metric_families = tracer.in_span("admin/gather_metrics", |_| {
+				admin_server.exporter.registry().gather()
+			});
+
+			encoder.encode(&metric_families, &mut buffer).unwrap();
+			admin_server
+				.metrics
+				.http_body_gauge
+				.record(buffer.len() as u64);
+
+			Response::builder()
+				.status(200)
+				.header(CONTENT_TYPE, encoder.format_type())
+				.body(Body::from(buffer))
+				.unwrap()
+		}
+		_ => Response::builder()
+			.status(404)
+			.body(Body::from("Not implemented"))
+			.unwrap(),
+	};
+
+	admin_server
+		.metrics
+		.http_req_histogram
+		.record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+	Ok(response)
+}
+
+// AdminServer hold the admin server internal admin_server and the metric exporter
+pub struct AdminServer {
+	exporter: PrometheusExporter,
+	metrics: AdminServerMetrics,
+}
+
+// GarageMetricadmin_server holds the metrics counter definition for Garage
+// FIXME: we would rather have that split up among the different libraries?
+struct AdminServerMetrics {
+	http_counter: BoundCounter<u64>,
+	http_body_gauge: BoundValueRecorder<u64>,
+	http_req_histogram: BoundValueRecorder<f64>,
+}
+
+impl AdminServer {
+	/// init initilialize the AdminServer and background metric server
+	pub fn init() -> AdminServer {
+		let exporter = opentelemetry_prometheus::exporter().init();
+		let meter = global::meter("garage/admin_server");
+		AdminServer {
+			exporter,
+			metrics: AdminServerMetrics {
+				http_counter: meter
+					.u64_counter("admin.http_requests_total")
+					.with_description("Total number of HTTP requests made.")
+					.init()
+					.bind(&[]),
+				http_body_gauge: meter
+					.u64_value_recorder("admin.http_response_size_bytes")
+					.with_description("The metrics HTTP response sizes in bytes.")
+					.init()
+					.bind(&[]),
+				http_req_histogram: meter
+					.f64_value_recorder("admin.http_request_duration_seconds")
+					.with_description("The HTTP request latencies in seconds.")
+					.init()
+					.bind(&[]),
+			},
+		}
+	}
+	/// run execute the admin server on the designated HTTP port and listen for requests
+	pub async fn run(
+		self,
+		bind_addr: SocketAddr,
+		shutdown_signal: impl Future<Output = ()>,
+	) -> Result<(), GarageError> {
+		let admin_server = Arc::new(self);
+		// For every connection, we must make a `Service` to handle all
+		// incoming HTTP requests on said connection.
+		let make_svc = make_service_fn(move |_conn| {
+			let admin_server = admin_server.clone();
+			// This is the `Service` that will handle the connection.
+			// `service_fn` is a helper to convert a function that
+			// returns a Response into a `Service`.
+			async move {
+				Ok::<_, Infallible>(service_fn(move |req| {
+					let tracer = opentelemetry::global::tracer("garage");
+					let span = tracer
+						.span_builder("admin/request")
+						.with_trace_id(gen_trace_id())
+						.start(&tracer);
+
+					serve_req(req, admin_server.clone())
+						.with_context(Context::current_with_span(span))
+				}))
+			}
+		});
+
+		let server = Server::bind(&bind_addr).serve(make_svc);
+		let graceful = server.with_graceful_shutdown(shutdown_signal);
+		info!("Admin server listening on http://{}", bind_addr);
+
+		graceful.await?;
+		Ok(())
+	}
+}
--- a/src/admin/tracing_setup.rs
+++ b/src/admin/tracing_setup.rs
@ -0,0 +1,37 @@
+use std::time::Duration;
+
+use opentelemetry::sdk::{
+	trace::{self, IdGenerator, Sampler},
+	Resource,
+};
+use opentelemetry::KeyValue;
+use opentelemetry_otlp::WithExportConfig;
+
+use garage_util::data::*;
+use garage_util::error::*;
+
+pub fn init_tracing(export_to: &str, node_id: Uuid) -> Result<(), Error> {
+	let node_id = hex::encode(&node_id.as_slice()[..8]);
+
+	opentelemetry_otlp::new_pipeline()
+		.tracing()
+		.with_exporter(
+			opentelemetry_otlp::new_exporter()
+				.tonic()
+				.with_endpoint(export_to)
+				.with_timeout(Duration::from_secs(3)),
+		)
+		.with_trace_config(
+			trace::config()
+				.with_id_generator(IdGenerator::default())
+				.with_sampler(Sampler::AlwaysOn)
+				.with_resource(Resource::new(vec![
+					KeyValue::new("service.name", "garage"),
+					KeyValue::new("service.instance.id", node_id),
+				])),
+		)
+		.install_batch(opentelemetry::runtime::Tokio)
+		.ok_or_message("Unable to initialize tracing")?;
+
+	Ok(())
+}
--- a/src/api/Cargo.toml
+++ b/src/api/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "garage_api"
-version = "0.6.0"
+version = "0.7.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@ -14,9 +14,10 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-garage_model = { version = "0.6.0", path = "../model" }
-garage_table = { version = "0.6.0", path = "../table" }
-garage_util = { version = "0.6.0", path = "../util" }
+garage_model = { version = "0.7.0", path = "../model" }
+garage_table = { version = "0.7.0", path = "../table" }
+garage_block = { version = "0.7.0", path = "../block" }
+garage_util = { version = "0.7.0", path = "../util" }

 base64 = "0.13"
 bytes = "1.0"
@ -26,7 +27,7 @@ err-derive = "0.3"
 hex = "0.4"
 hmac = "0.10"
 idna = "0.2"
-log = "0.4"
+tracing = "0.1.30"
 md-5 = "0.9"
 nom = "7.1"
 sha2 = "0.9"
@ -49,3 +50,5 @@ serde_bytes = "0.11"
 serde_json = "1.0"
 quick-xml = { version = "0.21", features = [ "serialize" ] }
 url = "2.1"
+
+opentelemetry = "0.17"
--- a/src/api/api_server.rs
+++ b/src/api/api_server.rs
@ -1,14 +1,24 @@
 use std::net::SocketAddr;
 use std::sync::Arc;

+use chrono::{DateTime, NaiveDateTime, Utc};
 use futures::future::Future;
+use futures::prelude::*;
 use hyper::header;
 use hyper::server::conn::AddrStream;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server};

+use opentelemetry::{
+	global,
+	metrics::{Counter, ValueRecorder},
+	trace::{FutureExt, TraceContextExt, Tracer},
+	Context, KeyValue,
+};
+
 use garage_util::data::*;
 use garage_util::error::Error as GarageError;
+use garage_util::metrics::{gen_trace_id, RecordDuration};

 use garage_model::garage::Garage;
 use garage_model::key_table::Key;
@ -16,7 +26,10 @@ use garage_model::key_table::Key;
 use garage_table::util::*;

 use crate::error::*;
+use crate::signature::compute_scope;
 use crate::signature::payload::check_payload_signature;
+use crate::signature::streaming::SignedPayloadStream;
+use crate::signature::LONG_DATETIME;

 use crate::helpers::*;
 use crate::s3_bucket::*;
@ -30,6 +43,34 @@ use crate::s3_put::*;
 use crate::s3_router::{Authorization, Endpoint};
 use crate::s3_website::*;

+struct ApiMetrics {
+	request_counter: Counter<u64>,
+	error_counter: Counter<u64>,
+	request_duration: ValueRecorder<f64>,
+}
+
+impl ApiMetrics {
+	fn new() -> Self {
+		let meter = global::meter("garage/api");
+		Self {
+			request_counter: meter
+				.u64_counter("api.request_counter")
+				.with_description("Number of API calls to the various S3 API endpoints")
+				.init(),
+			error_counter: meter
+				.u64_counter("api.error_counter")
+				.with_description(
+					"Number of API calls to the various S3 API endpoints that resulted in errors",
+				)
+				.init(),
+			request_duration: meter
+				.f64_value_recorder("api.request_duration")
+				.with_description("Duration of API calls to the various S3 API endpoints")
+				.init(),
+		}
+	}
+}
+
 /// Run the S3 API server
 pub async fn run_api_server(
 	garage: Arc<Garage>,
@ -37,13 +78,19 @@ pub async fn run_api_server(
 ) -> Result<(), GarageError> {
 	let addr = &garage.config.s3_api.api_bind_addr;

+	let metrics = Arc::new(ApiMetrics::new());
+
 	let service = make_service_fn(|conn: &AddrStream| {
 		let garage = garage.clone();
+		let metrics = metrics.clone();
+
 		let client_addr = conn.remote_addr();
 		async move {
 			Ok::<_, GarageError>(service_fn(move |req: Request<Body>| {
 				let garage = garage.clone();
-				handler(garage, req, client_addr)
+				let metrics = metrics.clone();
+
+				handler(garage, metrics, req, client_addr)
 			}))
 		}
 	});
@ -59,13 +106,29 @@ pub async fn run_api_server(

 async fn handler(
 	garage: Arc<Garage>,
+	metrics: Arc<ApiMetrics>,
 	req: Request<Body>,
 	addr: SocketAddr,
 ) -> Result<Response<Body>, GarageError> {
 	let uri = req.uri().clone();
 	info!("{} {} {}", addr, req.method(), uri);
 	debug!("{:?}", req);
-	match handler_inner(garage.clone(), req).await {
+
+	let tracer = opentelemetry::global::tracer("garage");
+	let span = tracer
+		.span_builder("S3 API call (unknown)")
+		.with_trace_id(gen_trace_id())
+		.with_attributes(vec![
+			KeyValue::new("method", format!("{}", req.method())),
+			KeyValue::new("uri", req.uri().to_string()),
+		])
+		.start(&tracer);
+
+	let res = handler_stage2(garage.clone(), metrics, req)
+		.with_context(Context::current_with_span(span))
+		.await;
+
+	match res {
 		Ok(x) => {
 			debug!("{} {:?}", x.status(), x.headers());
 			Ok(x)
@ -92,11 +155,15 @@ async fn handler(
 	}
 }

-async fn handler_inner(garage: Arc<Garage>, req: Request<Body>) -> Result<Response<Body>, Error> {
+async fn handler_stage2(
+	garage: Arc<Garage>,
+	metrics: Arc<ApiMetrics>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
 	let authority = req
 		.headers()
 		.get(header::HOST)
-		.ok_or_else(|| Error::BadRequest("HOST header required".to_owned()))?
+		.ok_or_bad_request("Host header required")?
 		.to_str()?;

 	let host = authority_to_host(authority)?;
@ -111,15 +178,105 @@ async fn handler_inner(garage: Arc<Garage>, req: Request<Body>) -> Result<Respon
 	let (endpoint, bucket_name) = Endpoint::from_request(&req, bucket_name.map(ToOwned::to_owned))?;
 	debug!("Endpoint: {:?}", endpoint);

-	if let Endpoint::PostObject {} = endpoint {
-		return handle_post_object(garage, req, bucket_name.unwrap()).await;
+	let current_context = Context::current();
+	let current_span = current_context.span();
+	current_span.update_name::<String>(format!("S3 API {}", endpoint.name()));
+	current_span.set_attribute(KeyValue::new("endpoint", endpoint.name()));
+	current_span.set_attribute(KeyValue::new(
+		"bucket",
+		bucket_name.clone().unwrap_or_default(),
+	));
+
+	let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())];
+
+	let res = handler_stage3(garage, req, endpoint, bucket_name)
+		.record_duration(&metrics.request_duration, &metrics_tags[..])
+		.await;
+
+	metrics.request_counter.add(1, &metrics_tags[..]);
+
+	let status_code = match &res {
+		Ok(r) => r.status(),
+		Err(e) => e.http_status_code(),
+	};
+	if status_code.is_client_error() || status_code.is_server_error() {
+		metrics.error_counter.add(
+			1,
+			&[
+				metrics_tags[0].clone(),
+				KeyValue::new("status_code", status_code.as_str().to_string()),
+			],
+		);
 	}

-	let (api_key, content_sha256) = check_payload_signature(&garage, &req).await?;
+	res
+}
+
+async fn handler_stage3(
+	garage: Arc<Garage>,
+	req: Request<Body>,
+	endpoint: Endpoint,
+	bucket_name: Option<String>,
+) -> Result<Response<Body>, Error> {
+	// Some endpoints are processed early, before we even check for an API key
+	if let Endpoint::PostObject = endpoint {
+		return handle_post_object(garage, req, bucket_name.unwrap()).await;
+	}
+	if let Endpoint::Options = endpoint {
+		return handle_options_s3api(garage, &req, bucket_name).await;
+	}
+
+	let (api_key, mut content_sha256) = check_payload_signature(&garage, &req).await?;
 	let api_key = api_key.ok_or_else(|| {
 		Error::Forbidden("Garage does not support anonymous access yet".to_string())
 	})?;

+	let req = match req.headers().get("x-amz-content-sha256") {
+		Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => {
+			let signature = content_sha256
+				.take()
+				.ok_or_bad_request("No signature provided")?;
+
+			let secret_key = &api_key
+				.state
+				.as_option()
+				.ok_or_internal_error("Deleted key state")?
+				.secret_key;
+
+			let date = req
+				.headers()
+				.get("x-amz-date")
+				.ok_or_bad_request("Missing X-Amz-Date field")?
+				.to_str()?;
+			let date: NaiveDateTime = NaiveDateTime::parse_from_str(date, LONG_DATETIME)
+				.ok_or_bad_request("Invalid date")?;
+			let date: DateTime<Utc> = DateTime::from_utc(date, Utc);
+
+			let scope = compute_scope(&date, &garage.config.s3_api.s3_region);
+			let signing_hmac = crate::signature::signing_hmac(
+				&date,
+				secret_key,
+				&garage.config.s3_api.s3_region,
+				"s3",
+			)
+			.ok_or_internal_error("Unable to build signing HMAC")?;
+
+			req.map(move |body| {
+				Body::wrap_stream(
+					SignedPayloadStream::new(
+						body.map_err(Error::from),
+						signing_hmac,
+						date,
+						&scope,
+						signature,
+					)
+					.map_err(Error::from),
+				)
+			})
+		}
+		_ => req,
+	};
+
 	let bucket_name = match bucket_name {
 		None => return handle_request_without_bucket(garage, req, api_key, endpoint).await,
 		Some(bucket) => bucket.to_string(),
@ -161,7 +318,6 @@ async fn handler_inner(garage: Arc<Garage>, req: Request<Body>) -> Result<Respon
 	};

 	let resp = match endpoint {
-		Endpoint::Options => handle_options(&req, &bucket).await,
 		Endpoint::HeadObject {
 			key, part_number, ..
 		} => handle_head(garage, &req, bucket_id, &key, part_number).await,
@ -202,7 +358,7 @@ async fn handler_inner(garage: Arc<Garage>, req: Request<Body>) -> Result<Respon
 			.await
 		}
 		Endpoint::PutObject { key } => {
-			handle_put(garage, req, bucket_id, &key, &api_key, content_sha256).await
+			handle_put(garage, req, bucket_id, &key, content_sha256).await
 		}
 		Endpoint::AbortMultipartUpload { key, upload_id } => {
 			handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await
--- a/src/api/lib.rs
+++ b/src/api/lib.rs
@ -1,6 +1,6 @@
 //! Crate for serving a S3 compatible API
 #[macro_use]
-extern crate log;
+extern crate tracing;

 pub mod error;
 pub use error::Error;
@ -10,7 +10,8 @@ mod encoding;
 mod api_server;
 pub use api_server::run_api_server;

-mod signature;
+/// This mode is public only to help testing. Don't expect stability here
+pub mod signature;

 pub mod helpers;
 mod s3_bucket;
--- a/src/api/s3_cors.rs
+++ b/src/api/s3_cors.rs
@ -100,7 +100,63 @@ pub async fn handle_put_cors(
 		.body(Body::empty())?)
 }

-pub async fn handle_options(req: &Request<Body>, bucket: &Bucket) -> Result<Response<Body>, Error> {
+pub async fn handle_options_s3api(
+	garage: Arc<Garage>,
+	req: &Request<Body>,
+	bucket_name: Option<String>,
+) -> Result<Response<Body>, Error> {
+	// FIXME: CORS rules of buckets with local aliases are
+	// not taken into account.
+
+	// If the bucket name is a global bucket name,
+	// we try to apply the CORS rules of that bucket.
+	// If a user has a local bucket name that has
+	// the same name, its CORS rules won't be applied
+	// and will be shadowed by the rules of the globally
+	// existing bucket (but this is inevitable because
+	// OPTIONS calls are not auhtenticated).
+	if let Some(bn) = bucket_name {
+		let helper = garage.bucket_helper();
+		let bucket_id = helper.resolve_global_bucket_name(&bn).await?;
+		if let Some(id) = bucket_id {
+			let bucket = garage
+				.bucket_table
+				.get(&EmptyKey, &id)
+				.await?
+				.filter(|b| !b.state.is_deleted())
+				.ok_or(Error::NoSuchBucket)?;
+			handle_options_for_bucket(req, &bucket)
+		} else {
+			// If there is a bucket name in the request, but that name
+			// does not correspond to a global alias for a bucket,
+			// then it's either a non-existing bucket or a local bucket.
+			// We have no way of knowing, because the request is not
+			// authenticated and thus we can't resolve local aliases.
+			// We take the permissive approach of allowing everything,
+			// because we don't want to prevent web apps that use
+			// local bucket names from making API calls.
+			Ok(Response::builder()
+				.header(ACCESS_CONTROL_ALLOW_ORIGIN, "*")
+				.header(ACCESS_CONTROL_ALLOW_METHODS, "*")
+				.status(StatusCode::OK)
+				.body(Body::empty())?)
+		}
+	} else {
+		// If there is no bucket name in the request,
+		// we are doing a ListBuckets call, which we want to allow
+		// for all origins.
+		Ok(Response::builder()
+			.header(ACCESS_CONTROL_ALLOW_ORIGIN, "*")
+			.header(ACCESS_CONTROL_ALLOW_METHODS, "GET")
+			.status(StatusCode::OK)
+			.body(Body::empty())?)
+	}
+}
+
+pub fn handle_options_for_bucket(
+	req: &Request<Body>,
+	bucket: &Bucket,
+) -> Result<Response<Body>, Error> {
 	let origin = req
 		.headers()
 		.get("Origin")
@ -144,12 +200,7 @@ pub fn find_matching_cors_rule<'a>(
 				None => vec![],
 			};
 			return Ok(cors_config.iter().find(|rule| {
-				cors_rule_matches(
-					rule,
-					origin,
-					&req.method().to_string(),
-					request_headers.iter(),
-				)
+				cors_rule_matches(rule, origin, req.method().as_ref(), request_headers.iter())
 			}));
 		}
 	}
--- a/src/api/s3_list.rs
+++ b/src/api/s3_list.rs
@ -1042,12 +1042,12 @@ mod tests {

 		query.common.prefix = "a/".to_string();
 		assert_eq!(
-			common_prefix(&objs.get(0).unwrap(), &query.common),
+			common_prefix(objs.get(0).unwrap(), &query.common),
 			Some("a/b/")
 		);

 		query.common.prefix = "a/b/".to_string();
-		assert_eq!(common_prefix(&objs.get(0).unwrap(), &query.common), None);
+		assert_eq!(common_prefix(objs.get(0).unwrap(), &query.common), None);
 	}

 	#[test]
@ -1272,7 +1272,7 @@ mod tests {
 		Version {
 			bucket_id: uuid,
 			key: "a".to_string(),
-			uuid: uuid,
+			uuid,
 			deleted: false.into(),
 			blocks: crdt::Map::<VersionBlockKey, VersionBlock>::from_iter(blocks),
 			parts_etags: crdt::Map::<u64, String>::from_iter(etags),
--- a/src/api/s3_put.rs
+++ b/src/api/s3_put.rs
@ -1,8 +1,7 @@
 use std::collections::{BTreeMap, BTreeSet, VecDeque};
 use std::sync::Arc;

-use chrono::{DateTime, NaiveDateTime, Utc};
-use futures::{prelude::*, TryFutureExt};
+use futures::prelude::*;
 use hyper::body::{Body, Bytes};
 use hyper::header::{HeaderMap, HeaderValue};
 use hyper::{Request, Response};
@ -14,26 +13,22 @@ use garage_util::data::*;
 use garage_util::error::Error as GarageError;
 use garage_util::time::*;

-use garage_model::block::INLINE_THRESHOLD;
+use garage_block::manager::INLINE_THRESHOLD;
 use garage_model::block_ref_table::*;
 use garage_model::garage::Garage;
-use garage_model::key_table::Key;
 use garage_model::object_table::*;
 use garage_model::version_table::*;

 use crate::error::*;
 use crate::s3_xml;
-use crate::signature::streaming::SignedPayloadStream;
-use crate::signature::LONG_DATETIME;
-use crate::signature::{compute_scope, verify_signed_content};
+use crate::signature::verify_signed_content;

 pub async fn handle_put(
 	garage: Arc<Garage>,
 	req: Request<Body>,
 	bucket_id: Uuid,
 	key: &str,
-	api_key: &Key,
-	mut content_sha256: Option<Hash>,
+	content_sha256: Option<Hash>,
 ) -> Result<Response<Body>, Error> {
 	// Retrieve interesting headers from request
 	let headers = get_headers(req.headers())?;
@ -43,52 +38,10 @@ pub async fn handle_put(
 		Some(x) => Some(x.to_str()?.to_string()),
 		None => None,
 	};
-	let payload_seed_signature = match req.headers().get("x-amz-content-sha256") {
-		Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => {
-			let content_sha256 = content_sha256
-				.take()
-				.ok_or_bad_request("No signature provided")?;
-			Some(content_sha256)
-		}
-		_ => None,
-	};

-	// Parse body of uploaded file
-	let (head, body) = req.into_parts();
+	let (_head, body) = req.into_parts();
 	let body = body.map_err(Error::from);

-	let body = if let Some(signature) = payload_seed_signature {
-		let secret_key = &api_key
-			.state
-			.as_option()
-			.ok_or_internal_error("Deleted key state")?
-			.secret_key;
-
-		let date = head
-			.headers
-			.get("x-amz-date")
-			.ok_or_bad_request("Missing X-Amz-Date field")?
-			.to_str()?;
-		let date: NaiveDateTime =
-			NaiveDateTime::parse_from_str(date, LONG_DATETIME).ok_or_bad_request("Invalid date")?;
-		let date: DateTime<Utc> = DateTime::from_utc(date, Utc);
-
-		let scope = compute_scope(&date, &garage.config.s3_api.s3_region);
-		let signing_hmac = crate::signature::signing_hmac(
-			&date,
-			secret_key,
-			&garage.config.s3_api.s3_region,
-			"s3",
-		)
-		.ok_or_internal_error("Unable to build signing HMAC")?;
-
-		SignedPayloadStream::new(body, signing_hmac, date, &scope, signature)?
-			.map_err(Error::from)
-			.boxed()
-	} else {
-		body.boxed()
-	};
-
 	save_stream(
 		garage,
 		headers,
--- a/src/api/s3_router.rs
+++ b/src/api/s3_router.rs
@ -414,8 +414,7 @@ pub enum Endpoint {
 	// It's intended to be used with HTML forms, using a multipart/form-data body.
 	// It works a lot like presigned requests, but everything is in the form instead
 	// of being query parameters of the URL, so authenticating it is a bit different.
-	PostObject {
-	},
+	PostObject,
 }}

 impl Endpoint {
@ -430,8 +429,12 @@ impl Endpoint {
 		let path = uri.path().trim_start_matches('/');
 		let query = uri.query();
 		if bucket.is_none() && path.is_empty() {
+			if *req.method() == Method::OPTIONS {
+				return Ok((Self::Options, None));
+			} else {
 				return Ok((Self::ListBuckets, None));
 			}
+		}

 		let (bucket, key) = if let Some(bucket) = bucket {
 			(bucket, path)
--- a/src/api/s3_website.rs
+++ b/src/api/s3_website.rs
@ -259,8 +259,7 @@ impl RoutingRuleInner {
 		let has_prefix = self
 			.condition
 			.as_ref()
-			.map(|c| c.prefix.as_ref())
-			.flatten()
+			.and_then(|c| c.prefix.as_ref())
 			.is_some();
 		self.redirect.validate(has_prefix)
 	}
--- a/src/api/signature/payload.rs
+++ b/src/api/signature/payload.rs
@ -51,8 +51,7 @@ pub async fn check_payload_signature(

 	let canonical_request = canonical_request(
 		request.method(),
-		&request.uri().path().to_string(),
-		&canonical_query_string(request.uri()),
+		request.uri(),
 		&headers,
 		&authorization.signed_headers,
 		&authorization.content_sha256,
@ -60,6 +59,9 @@ pub async fn check_payload_signature(
 	let (_, scope) = parse_credential(&authorization.credential)?;
 	let string_to_sign = string_to_sign(&authorization.date, &scope, &canonical_request);

+	trace!("canonical request:\n{}", canonical_request);
+	trace!("string to sign:\n{}", string_to_sign);
+
 	let key = verify_v4(
 		garage,
 		&authorization.credential,
@ -212,7 +214,7 @@ fn parse_credential(cred: &str) -> Result<(String, String), Error> {
 	))
 }

-fn string_to_sign(datetime: &DateTime<Utc>, scope_string: &str, canonical_req: &str) -> String {
+pub fn string_to_sign(datetime: &DateTime<Utc>, scope_string: &str, canonical_req: &str) -> String {
 	let mut hasher = Sha256::default();
 	hasher.update(canonical_req.as_bytes());
 	[
@ -224,18 +226,17 @@ fn string_to_sign(datetime: &DateTime<Utc>, scope_string: &str, canonical_req: &
 	.join("\n")
 }

-fn canonical_request(
+pub fn canonical_request(
 	method: &Method,
-	url_path: &str,
-	canonical_query_string: &str,
+	uri: &hyper::Uri,
 	headers: &HashMap<String, String>,
 	signed_headers: &str,
 	content_sha256: &str,
 ) -> String {
 	[
 		method.as_str(),
-		url_path,
-		canonical_query_string,
+		uri.path(),
+		&canonical_query_string(uri),
 		&canonical_header_string(headers, signed_headers),
 		"",
 		signed_headers,
--- a/src/api/signature/streaming.rs
+++ b/src/api/signature/streaming.rs
@ -164,15 +164,15 @@ where
 		datetime: DateTime<Utc>,
 		scope: &str,
 		seed_signature: Hash,
-	) -> Result<Self, Error> {
-		Ok(Self {
+	) -> Self {
+		Self {
 			stream,
 			buf: bytes::BytesMut::new(),
 			datetime,
 			scope: scope.into(),
 			signing_hmac,
 			previous_signature: seed_signature,
-		})
+		}
 	}

 	fn parse_next(input: &[u8]) -> nom::IResult<&[u8], SignedPayload, SignedPayloadStreamError> {
@ -305,7 +305,7 @@ mod tests {
 		let seed_signature = Hash::default();

 		let mut stream =
-			SignedPayloadStream::new(body, signing_hmac, datetime, &scope, seed_signature).unwrap();
+			SignedPayloadStream::new(body, signing_hmac, datetime, &scope, seed_signature);

 		assert!(stream.try_next().await.is_err());
 		match stream.try_next().await {
--- a/src/block/Cargo.toml
+++ b/src/block/Cargo.toml
@ -0,0 +1,38 @@
+[package]
+name = "garage_block"
+version = "0.7.0"
+authors = ["Alex Auvolat <alex@adnab.me>"]
+edition = "2018"
+license = "AGPL-3.0"
+description = "Block manager for the Garage object store"
+repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
+readme = "../../README.md"
+
+[lib]
+path = "lib.rs"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+garage_rpc = { version = "0.7.0", path = "../rpc" }
+garage_util = { version = "0.7.0", path = "../util" }
+garage_table = { version = "0.7.0", path = "../table" }
+
+opentelemetry = "0.17"
+
+async-trait = "0.1.7"
+bytes = "1.0"
+hex = "0.4"
+tracing = "0.1.30"
+rand = "0.8"
+zstd = { version = "0.9", default-features = false }
+
+sled = "0.34"
+
+rmp-serde = "0.15"
+serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
+serde_bytes = "0.11"
+
+futures = "0.3"
+futures-util = "0.3"
+tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
--- a/src/block/block.rs
+++ b/src/block/block.rs
@ -0,0 +1,81 @@
+use serde::{Deserialize, Serialize};
+use zstd::stream::{decode_all as zstd_decode, Encoder};
+
+use garage_util::data::*;
+use garage_util::error::*;
+
+/// A possibly compressed block of data
+#[derive(Debug, Serialize, Deserialize)]
+pub enum DataBlock {
+	/// Uncompressed data
+	Plain(#[serde(with = "serde_bytes")] Vec<u8>),
+	/// Data compressed with zstd
+	Compressed(#[serde(with = "serde_bytes")] Vec<u8>),
+}
+
+impl DataBlock {
+	/// Query whether this block is compressed
+	pub fn is_compressed(&self) -> bool {
+		matches!(self, DataBlock::Compressed(_))
+	}
+
+	/// Get the inner, possibly compressed buffer. You should probably use [`DataBlock::verify_get`]
+	/// instead
+	pub fn inner_buffer(&self) -> &[u8] {
+		use DataBlock::*;
+		let (Plain(ref res) | Compressed(ref res)) = self;
+		res
+	}
+
+	/// Get the buffer, possibly decompressing it, and verify it's integrity.
+	/// For Plain block, data is compared to hash, for Compressed block, zstd checksumming system
+	/// is used instead.
+	pub fn verify_get(self, hash: Hash) -> Result<Vec<u8>, Error> {
+		match self {
+			DataBlock::Plain(data) => {
+				if blake2sum(&data) == hash {
+					Ok(data)
+				} else {
+					Err(Error::CorruptData(hash))
+				}
+			}
+			DataBlock::Compressed(data) => {
+				zstd_decode(&data[..]).map_err(|_| Error::CorruptData(hash))
+			}
+		}
+	}
+
+	/// Verify data integrity. Allocate less than [`DataBlock::verify_get`] and don't consume self, but
+	/// does not return the buffer content.
+	pub fn verify(&self, hash: Hash) -> Result<(), Error> {
+		match self {
+			DataBlock::Plain(data) => {
+				if blake2sum(data) == hash {
+					Ok(())
+				} else {
+					Err(Error::CorruptData(hash))
+				}
+			}
+			DataBlock::Compressed(data) => zstd::stream::copy_decode(&data[..], std::io::sink())
+				.map_err(|_| Error::CorruptData(hash)),
+		}
+	}
+
+	pub fn from_buffer(data: Vec<u8>, level: Option<i32>) -> DataBlock {
+		if let Some(level) = level {
+			if let Ok(data) = zstd_encode(&data[..], level) {
+				return DataBlock::Compressed(data);
+			}
+		}
+		DataBlock::Plain(data)
+	}
+}
+
+fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> {
+	let mut result = Vec::<u8>::new();
+	let mut encoder = Encoder::new(&mut result, level)?;
+	encoder.include_checksum(true)?;
+	std::io::copy(&mut source, &mut encoder)?;
+	encoder.finish()?;
+	Ok(result)
+}
--- a/src/block/lib.rs
+++ b/src/block/lib.rs
@ -0,0 +1,8 @@
+#[macro_use]
+extern crate tracing;
+
+pub mod manager;
+
+mod block;
+mod metrics;
+mod rc;
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@ -3,18 +3,24 @@ use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::Duration;

-use arc_swap::ArcSwapOption;
 use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
 use futures::future::*;
 use futures::select;
-use serde::{Deserialize, Serialize};
 use tokio::fs;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tokio::sync::{watch, Mutex, Notify};
-use zstd::stream::{decode_all as zstd_decode, Encoder};
+
+use opentelemetry::{
+	trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer},
+	Context, KeyValue,
+};

 use garage_util::data::*;
 use garage_util::error::*;
+use garage_util::metrics::RecordDuration;
+use garage_util::sled_counter::SledCountedTree;
 use garage_util::time::*;
 use garage_util::tranquilizer::Tranquilizer;

@ -23,16 +29,13 @@ use garage_rpc::*;

 use garage_table::replication::{TableReplication, TableShardedReplication};

-use crate::block_ref_table::*;
-
-use crate::garage::Garage;
+use crate::block::*;
+use crate::metrics::*;
+use crate::rc::*;

 /// Size under which data will be stored inlined in database instead of as files
 pub const INLINE_THRESHOLD: usize = 3072;

-pub const BACKGROUND_WORKERS: u64 = 1;
-pub const BACKGROUND_TRANQUILITY: u32 = 3;
-
 // Timeout for RPCs that read and write blocks to remote nodes
 const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30);
 // Timeout for RPCs that ask other nodes whether they need a copy
@ -40,13 +43,17 @@ const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30);
 const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);

 // The delay between the time where a resync operation fails
-// and the time when it is retried.
+// and the time when it is retried, with exponential backoff
+// (multiplied by 2, 4, 8, 16, etc. for every consecutive failure).
 const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60);
+// The minimum retry delay is 60 seconds = 1 minute
+// The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour)
+const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6;

 // The delay between the moment when the reference counter
 // drops to zero, and the moment where we allow ourselves
 // to delete the block locally.
-const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
+pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);

 /// RPC messages used to share blocks of data between nodes
 #[derive(Debug, Serialize, Deserialize)]
@ -66,73 +73,6 @@ pub enum BlockRpc {
 	NeedBlockReply(bool),
 }

-/// A possibly compressed block of data
-#[derive(Debug, Serialize, Deserialize)]
-pub enum DataBlock {
-	/// Uncompressed data
-	Plain(#[serde(with = "serde_bytes")] Vec<u8>),
-	/// Data compressed with zstd
-	Compressed(#[serde(with = "serde_bytes")] Vec<u8>),
-}
-
-impl DataBlock {
-	/// Query whether this block is compressed
-	pub fn is_compressed(&self) -> bool {
-		matches!(self, DataBlock::Compressed(_))
-	}
-
-	/// Get the inner, possibly compressed buffer. You should probably use [`DataBlock::verify_get`]
-	/// instead
-	pub fn inner_buffer(&self) -> &[u8] {
-		use DataBlock::*;
-		let (Plain(ref res) | Compressed(ref res)) = self;
-		res
-	}
-
-	/// Get the buffer, possibly decompressing it, and verify it's integrity.
-	/// For Plain block, data is compared to hash, for Compressed block, zstd checksumming system
-	/// is used instead.
-	pub fn verify_get(self, hash: Hash) -> Result<Vec<u8>, Error> {
-		match self {
-			DataBlock::Plain(data) => {
-				if blake2sum(&data) == hash {
-					Ok(data)
-				} else {
-					Err(Error::CorruptData(hash))
-				}
-			}
-			DataBlock::Compressed(data) => {
-				zstd_decode(&data[..]).map_err(|_| Error::CorruptData(hash))
-			}
-		}
-	}
-
-	/// Verify data integrity. Allocate less than [`DataBlock::verify_get`] and don't consume self, but
-	/// does not return the buffer content.
-	pub fn verify(&self, hash: Hash) -> Result<(), Error> {
-		match self {
-			DataBlock::Plain(data) => {
-				if blake2sum(data) == hash {
-					Ok(())
-				} else {
-					Err(Error::CorruptData(hash))
-				}
-			}
-			DataBlock::Compressed(data) => zstd::stream::copy_decode(&data[..], std::io::sink())
-				.map_err(|_| Error::CorruptData(hash)),
-		}
-	}
-
-	pub fn from_buffer(data: Vec<u8>, level: Option<i32>) -> DataBlock {
-		if let Some(level) = level {
-			if let Ok(data) = zstd_encode(&data[..], level) {
-				return DataBlock::Compressed(data);
-			}
-		}
-		DataBlock::Plain(data)
-	}
-}
-
 impl Rpc for BlockRpc {
 	type Response = Result<BlockRpc, Error>;
 }
@ -144,16 +84,21 @@ pub struct BlockManager {
 	/// Directory in which block are stored
 	pub data_dir: PathBuf,

+	compression_level: Option<i32>,
+	background_tranquility: u32,
+
 	mutation_lock: Mutex<BlockManagerLocked>,

-	rc: sled::Tree,
+	rc: BlockRc,

-	resync_queue: sled::Tree,
+	resync_queue: SledCountedTree,
 	resync_notify: Notify,
+	resync_errors: SledCountedTree,

 	system: Arc<System>,
 	endpoint: Arc<Endpoint<BlockRpc, Self>>,
-	pub(crate) garage: ArcSwapOption<Garage>,
+
+	metrics: BlockManagerMetrics,
 }

 // This custom struct contains functions that must only be ran
@ -165,16 +110,25 @@ impl BlockManager {
 	pub fn new(
 		db: &sled::Db,
 		data_dir: PathBuf,
+		compression_level: Option<i32>,
+		background_tranquility: u32,
 		replication: TableShardedReplication,
 		system: Arc<System>,
 	) -> Arc<Self> {
 		let rc = db
 			.open_tree("block_local_rc")
 			.expect("Unable to open block_local_rc tree");
+		let rc = BlockRc::new(rc);

 		let resync_queue = db
 			.open_tree("block_local_resync_queue")
 			.expect("Unable to open block_local_resync_queue tree");
+		let resync_queue = SledCountedTree::new(resync_queue);
+
+		let resync_errors = db
+			.open_tree("block_local_resync_errors")
+			.expect("Unable to open block_local_resync_errors tree");
+		let resync_errors = SledCountedTree::new(resync_errors);

 		let endpoint = system
 			.netapp
@ -182,19 +136,26 @@ impl BlockManager {

 		let manager_locked = BlockManagerLocked();

+		let metrics = BlockManagerMetrics::new(resync_queue.clone(), resync_errors.clone());
+
 		let block_manager = Arc::new(Self {
 			replication,
 			data_dir,
+			compression_level,
+			background_tranquility,
 			mutation_lock: Mutex::new(manager_locked),
 			rc,
 			resync_queue,
 			resync_notify: Notify::new(),
+			resync_errors,
 			system,
 			endpoint,
-			garage: ArcSwapOption::from(None),
+			metrics,
 		});
 		block_manager.endpoint.set_handler(block_manager.clone());

+		block_manager.clone().spawn_background_worker();
+
 		block_manager
 	}

@ -236,14 +197,7 @@ impl BlockManager {
 	/// Send block to nodes that should have it
 	pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> {
 		let who = self.replication.write_nodes(&hash);
-		let compression_level = self
-			.garage
-			.load()
-			.as_ref()
-			.unwrap()
-			.config
-			.compression_level;
-		let data = DataBlock::from_buffer(data, compression_level);
+		let data = DataBlock::from_buffer(data, self.compression_level);
 		self.system
 			.rpc
 			.try_call_many(
@ -265,18 +219,10 @@ impl BlockManager {
 	/// to fix any mismatch between the two.
 	pub async fn repair_data_store(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
 		// 1. Repair blocks from RC table.
-		let garage = self.garage.load_full().unwrap();
-		let mut last_hash = None;
-		for (i, entry) in garage.block_ref_table.data.store.iter().enumerate() {
-			let (_k, v_bytes) = entry?;
-			let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?;
-			if Some(&block_ref.block) == last_hash.as_ref() {
-				continue;
-			}
-			if !block_ref.deleted.get() {
-				last_hash = Some(block_ref.block);
-				self.put_to_resync(&block_ref.block, Duration::from_secs(0))?;
-			}
+		for (i, entry) in self.rc.rc.iter().enumerate() {
+			let (hash, _) = entry?;
+			let hash = Hash::try_from(&hash[..]).unwrap();
+			self.put_to_resync(&hash, Duration::from_secs(0))?;
 			if i & 0xFF == 0 && *must_exit.borrow() {
 				return Ok(());
 			}
@ -288,7 +234,10 @@ impl BlockManager {
 		// so that we can offload them if necessary and then delete them locally.
 		self.for_each_file(
 			(),
-			move |_, hash| async move { self.put_to_resync(&hash, Duration::from_secs(0)) },
+			move |_, hash| async move {
+				self.put_to_resync(&hash, Duration::from_secs(0))
+					.map_err(Into::into)
+			},
 			must_exit,
 		)
 		.await
@ -319,9 +268,14 @@ impl BlockManager {
 		self.resync_queue.len()
 	}

+	/// Get number of blocks that have an error
+	pub fn resync_errors_len(&self) -> usize {
+		self.resync_errors.len()
+	}
+
 	/// Get number of items in the refcount table
 	pub fn rc_len(&self) -> usize {
-		self.rc.len()
+		self.rc.rc.len()
 	}

 	//// ----- Managing the reference counter ----
@ -329,11 +283,7 @@ impl BlockManager {
 	/// Increment the number of time a block is used, putting it to resynchronization if it is
 	/// required, but not known
 	pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
-		let old_rc = self
-			.rc
-			.fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
-		let old_rc = RcEntry::parse_opt(old_rc);
-		if old_rc.is_zero() {
+		if self.rc.block_incref(hash)? {
 			// When the reference counter is incremented, there is
 			// normally a node that is responsible for sending us the
 			// data of the block. However that operation may fail,
@ -347,48 +297,51 @@ impl BlockManager {

 	/// Decrement the number of time a block is used
 	pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
-		let new_rc = self
-			.rc
-			.update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
-		let new_rc = RcEntry::parse_opt(new_rc);
-		if let RcEntry::Deletable { .. } = new_rc {
+		if self.rc.block_decref(hash)? {
+			// When the RC is decremented, it might drop to zero,
+			// indicating that we don't need the block.
+			// There is a delay before we garbage collect it;
+			// make sure that it is handled in the resync loop
+			// after that delay has passed.
 			self.put_to_resync(hash, BLOCK_GC_DELAY + Duration::from_secs(10))?;
 		}
 		Ok(())
 	}

-	/// Read a block's reference count
-	fn get_block_rc(&self, hash: &Hash) -> Result<RcEntry, Error> {
-		Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?))
-	}
-
-	/// Delete an entry in the RC table if it is deletable and the
-	/// deletion time has passed
-	fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
-		let now = now_msec();
-		self.rc.update_and_fetch(&hash, |rcval| {
-			let updated = match RcEntry::parse_opt(rcval) {
-				RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
-				v => v,
-			};
-			updated.serialize()
-		})?;
-		Ok(())
-	}
-
 	// ---- Reading and writing blocks locally ----

 	/// Write a block to disk
 	async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<BlockRpc, Error> {
-		self.mutation_lock
+		let write_size = data.inner_buffer().len() as u64;
+
+		let res = self
+			.mutation_lock
 			.lock()
 			.await
 			.write_block(hash, data, self)
-			.await
+			.bound_record_duration(&self.metrics.block_write_duration)
+			.await?;
+
+		self.metrics.bytes_written.add(write_size);
+
+		Ok(res)
 	}

 	/// Read block from disk, verifying it's integrity
 	async fn read_block(&self, hash: &Hash) -> Result<BlockRpc, Error> {
+		let data = self
+			.read_block_internal(hash)
+			.bound_record_duration(&self.metrics.block_read_duration)
+			.await?;
+
+		self.metrics
+			.bytes_read
+			.add(data.inner_buffer().len() as u64);
+
+		Ok(BlockRpc::PutBlock { hash: *hash, data })
+	}
+
+	async fn read_block_internal(&self, hash: &Hash) -> Result<DataBlock, Error> {
 		let mut path = self.block_path(hash);
 		let compressed = match self.is_block_compressed(hash).await {
 			Ok(c) => c,
@ -414,6 +367,8 @@ impl BlockManager {
 		};

 		if data.verify(*hash).is_err() {
+			self.metrics.corruption_counter.add(1);
+
 			self.mutation_lock
 				.lock()
 				.await
@ -423,7 +378,7 @@ impl BlockManager {
 			return Err(Error::CorruptData(*hash));
 		}

-		Ok(BlockRpc::PutBlock { hash: *hash, data })
+		Ok(data)
 	}

 	/// Check if this node should have a block, but don't actually have it
@ -466,22 +421,94 @@ impl BlockManager {

 	// ---- Resync loop ----

-	pub fn spawn_background_worker(self: Arc<Self>) {
-		// Launch n simultaneous workers for background resync loop preprocessing
-		for i in 0..BACKGROUND_WORKERS {
-			let bm2 = self.clone();
+	// This part manages a queue of blocks that need to be
+	// "resynchronized", i.e. that need to have a check that
+	// they are at present if we need them, or that they are
+	// deleted once the garbage collection delay has passed.
+	//
+	// Here are some explanations on how the resync queue works.
+	// There are two Sled trees that are used to have information
+	// about the status of blocks that need to be resynchronized:
+	//
+	// - resync_queue: a tree that is ordered first by a timestamp
+	//   (in milliseconds since Unix epoch) that is the time at which
+	//   the resync must be done, and second by block hash.
+	//   The key in this tree is just:
+	//       concat(timestamp (8 bytes), hash (32 bytes))
+	//   The value is the same 32-byte hash.
+	//
+	// - resync_errors: a tree that indicates for each block
+	//   if the last resync resulted in an error, and if so,
+	//   the following two informations (see the ErrorCounter struct):
+	//   - how many consecutive resync errors for this block?
+	//   - when was the last try?
+	//   These two informations are used to implement an
+	//   exponential backoff retry strategy.
+	//   The key in this tree is the 32-byte hash of the block,
+	//   and the value is the encoded ErrorCounter value.
+	//
+	// We need to have these two trees, because the resync queue
+	// is not just a queue of items to process, but a set of items
+	// that are waiting a specific delay until we can process them
+	// (the delay being necessary both internally for the exponential
+	// backoff strategy, and exposed as a parameter when adding items
+	// to the queue, e.g. to wait until the GC delay has passed).
+	// This is why we need one tree ordered by time, and one
+	// ordered by identifier of item to be processed (block hash).
+	//
+	// When the worker wants to process an item it takes from
+	// resync_queue, it checks in resync_errors that if there is an
+	// exponential back-off delay to await, it has passed before we
+	// process the item. If not, the item in the queue is skipped
+	// (but added back for later processing after the time of the
+	// delay).
+	//
+	// An alternative that would have seemed natural is to
+	// only add items to resync_queue with a processing time that is
+	// after the delay, but there are several issues with this:
+	// - This requires to synchronize updates to resync_queue and
+	//   resync_errors (with the current model, there is only one thread,
+	//   the worker thread, that accesses resync_errors,
+	//   so no need to synchronize) by putting them both in a lock.
+	//   This would mean that block_incref might need to take a lock
+	//   before doing its thing, meaning it has much more chances of
+	//   not completing successfully if something bad happens to Garage.
+	//   Currently Garage is not able to recover from block_incref that
+	//   doesn't complete successfully, because it is necessary to ensure
+	//   the consistency between the state of the block manager and
+	//   information in the BlockRef table.
+	// - If a resync fails, we put that block in the resync_errors table,
+	//   and also add it back to resync_queue to be processed after
+	//   the exponential back-off delay,
+	//   but maybe the block is already scheduled to be resynced again
+	//   at another time that is before the exponential back-off delay,
+	//   and we have no way to check that easily. This means that
+	//   in all cases, we need to check the resync_errors table
+	//   in the resync loop at the time when a block is popped from
+	//   the resync_queue.
+	// Overall, the current design is therefore simpler and more robust
+	// because it tolerates inconsistencies between the resync_queue
+	// and resync_errors table (items being scheduled in resync_queue
+	// for times that are earlier than the exponential back-off delay
+	// is a natural condition that is handled properly).
+
+	fn spawn_background_worker(self: Arc<Self>) {
+		// Launch a background workers for background resync loop processing
 		let background = self.system.background.clone();
 		tokio::spawn(async move {
-				tokio::time::sleep(Duration::from_secs(10 * (i + 1))).await;
-				background.spawn_worker(format!("block resync worker {}", i), move |must_exit| {
-					bm2.resync_loop(must_exit)
+			tokio::time::sleep(Duration::from_secs(10)).await;
+			background.spawn_worker("block resync worker".into(), move |must_exit| {
+				self.resync_loop(must_exit)
 			});
 		});
 	}
-	}

-	fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), Error> {
+	fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), sled::Error> {
 		let when = now_msec() + delay.as_millis() as u64;
+		self.put_to_resync_at(hash, when)
+	}
+
+	fn put_to_resync_at(&self, hash: &Hash, when: u64) -> Result<(), sled::Error> {
 		trace!("Put resync_queue: {} {:?}", when, hash);
 		let mut key = u64::to_be_bytes(when).to_vec();
 		key.extend(hash.as_ref());
@ -496,7 +523,7 @@ impl BlockManager {
 		while !*must_exit.borrow() {
 			match self.resync_iter(&mut must_exit).await {
 				Ok(true) => {
-					tranquilizer.tranquilize(BACKGROUND_TRANQUILITY).await;
+					tranquilizer.tranquilize(self.background_tranquility).await;
 				}
 				Ok(false) => {
 					tranquilizer.reset();
@ -516,20 +543,84 @@ impl BlockManager {
 		}
 	}

-	async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<bool, Error> {
-		if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? {
+	// The result of resync_iter is:
+	// - Ok(true) -> a block was processed (successfully or not)
+	// - Ok(false) -> no block was processed, but we are ready for the next iteration
+	// - Err(_) -> a Sled error occurred when reading/writing from resync_queue/resync_errors
+	async fn resync_iter(
+		&self,
+		must_exit: &mut watch::Receiver<bool>,
+	) -> Result<bool, sled::Error> {
+		if let Some(first_pair_res) = self.resync_queue.iter().next() {
+			let (time_bytes, hash_bytes) = first_pair_res?;
+
 			let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap());
 			let now = now_msec();
+
 			if now >= time_msec {
 				let hash = Hash::try_from(&hash_bytes[..]).unwrap();
-				let res = self.resync_block(&hash).await;
-				if let Err(e) = &res {
-					warn!("Error when resyncing {:?}: {}", hash, e);
-					self.put_to_resync(&hash, RESYNC_RETRY_DELAY)?;
+
+				if let Some(ec) = self.resync_errors.get(hash.as_slice())? {
+					let ec = ErrorCounter::decode(ec);
+					if now < ec.next_try() {
+						// if next retry after an error is not yet,
+						// don't do resync and return early, but still
+						// make sure the item is still in queue at expected time
+						self.put_to_resync_at(&hash, ec.next_try())?;
+						// ec.next_try() > now >= time_msec, so this remove
+						// is not removing the one we added just above
+						// (we want to do the remove after the insert to ensure
+						// that the item is not lost if we crash in-between)
+						self.resync_queue.remove(time_bytes)?;
+						return Ok(false);
 					}
+				}
+
+				let tracer = opentelemetry::global::tracer("garage");
+				let trace_id = gen_uuid();
+				let span = tracer
+					.span_builder("Resync block")
+					.with_trace_id(
+						opentelemetry::trace::TraceId::from_hex(&hex::encode(
+							&trace_id.as_slice()[..16],
+						))
+						.unwrap(),
+					)
+					.with_attributes(vec![KeyValue::new("block", format!("{:?}", hash))])
+					.start(&tracer);
+
+				let res = self
+					.resync_block(&hash)
+					.with_context(Context::current_with_span(span))
+					.bound_record_duration(&self.metrics.resync_duration)
+					.await;
+
+				self.metrics.resync_counter.add(1);
+
+				if let Err(e) = &res {
+					self.metrics.resync_error_counter.add(1);
+					warn!("Error when resyncing {:?}: {}", hash, e);
+
+					let err_counter = match self.resync_errors.get(hash.as_slice())? {
+						Some(ec) => ErrorCounter::decode(ec).add1(now + 1),
+						None => ErrorCounter::new(now + 1),
+					};
+
+					self.resync_errors
+						.insert(hash.as_slice(), err_counter.encode())?;
+
+					self.put_to_resync_at(&hash, err_counter.next_try())?;
+					// err_counter.next_try() >= now + 1 > now,
+					// the entry we remove from the queue is not
+					// the entry we inserted with put_to_resync_at
+					self.resync_queue.remove(time_bytes)?;
+				} else {
+					self.resync_errors.remove(hash.as_slice())?;
+					self.resync_queue.remove(time_bytes)?;
+				}
+
 				Ok(true)
 			} else {
-				self.resync_queue.insert(time_bytes, hash_bytes)?;
 				let delay = tokio::time::sleep(Duration::from_millis(time_msec - now));
 				select! {
 					_ = delay.fuse() => {},
@ -539,7 +630,15 @@ impl BlockManager {
 				Ok(false)
 			}
 		} else {
+			// Here we wait either for a notification that an item has been
+			// added to the queue, or for a constant delay of 10 secs to expire.
+			// The delay avoids a race condition where the notification happens
+			// between the time we checked the queue and the first poll
+			// to resync_notify.notified(): if that happens, we'll just loop
+			// back 10 seconds later, which is fine.
+			let delay = tokio::time::sleep(Duration::from_secs(10));
 			select! {
+				_ = delay.fuse() => {},
 				_ = self.resync_notify.notified().fuse() => {},
 				_ = must_exit.changed().fuse() => {},
 			}
@ -607,6 +706,12 @@ impl BlockManager {
 					need_nodes.len()
 				);

+				for node in need_nodes.iter() {
+					self.metrics
+						.resync_send_counter
+						.add(1, &[KeyValue::new("to", format!("{:?}", node))]);
+				}
+
 				let put_block_message = self.read_block(hash).await?;
 				self.system
 					.rpc
@ -634,7 +739,7 @@ impl BlockManager {
 				.delete_if_unneeded(hash, self)
 				.await?;

-			self.clear_deleted_block_rc(hash)?;
+			self.rc.clear_deleted_block_rc(hash)?;
 		}

 		if needed.is_nonzero() && !exists {
@ -644,6 +749,9 @@ impl BlockManager {
 			);

 			let block_data = self.rpc_get_raw_block(hash).await?;
+
+			self.metrics.resync_recv_counter.add(1);
+
 			self.write_block(hash, &block_data).await?;
 		}

@ -745,7 +853,7 @@ impl BlockManagerLocked {
 		mgr: &BlockManager,
 	) -> Result<BlockStatus, Error> {
 		let exists = mgr.is_block_compressed(hash).await.is_ok();
-		let needed = mgr.get_block_rc(hash)?;
+		let needed = mgr.rc.get_block_rc(hash)?;

 		Ok(BlockStatus { exists, needed })
 	}
@ -760,9 +868,11 @@ impl BlockManagerLocked {
 		let data = data.inner_buffer();

 		let mut path = mgr.block_dir(hash);
-		fs::create_dir_all(&path).await?;
-
+		let directory = path.clone();
 		path.push(hex::encode(hash));
+
+		fs::create_dir_all(&directory).await?;
+
 		let to_delete = match (mgr.is_block_compressed(hash).await, compressed) {
 			(Ok(true), _) => return Ok(BlockRpc::Ok),
 			(Ok(false), false) => return Ok(BlockRpc::Ok),
@ -783,6 +893,7 @@ impl BlockManagerLocked {
 		path2.set_extension("tmp");
 		let mut f = fs::File::create(&path2).await?;
 		f.write_all(data).await?;
+		f.sync_all().await?;
 		drop(f);

 		fs::rename(path2, path).await?;
@ -790,6 +901,19 @@ impl BlockManagerLocked {
 			fs::remove_file(to_delete).await?;
 		}

+		// We want to ensure that when this function returns, data is properly persisted
+		// to disk. The first step is the sync_all above that does an fsync on the data file.
+		// Now, we do an fsync on the containing directory, to ensure that the rename
+		// is persisted properly. See:
+		// http://thedjbway.b0llix.net/qmail/syncdir.html
+		let dir = fs::OpenOptions::new()
+			.read(true)
+			.mode(0)
+			.open(directory)
+			.await?;
+		dir.sync_all().await?;
+		drop(dir);
+
 		Ok(BlockRpc::Ok)
 	}

@ -819,117 +943,55 @@ impl BlockManagerLocked {
 				path.set_extension("zst");
 			}
 			fs::remove_file(path).await?;
+			mgr.metrics.delete_counter.add(1);
 		}
 		Ok(())
 	}
 }

-/// Describes the state of the reference counter for a block
+/// Counts the number of errors when resyncing a block,
+/// and the time of the last try.
+/// Used to implement exponential backoff.
 #[derive(Clone, Copy, Debug)]
-enum RcEntry {
-	/// Present: the block has `count` references, with `count` > 0.
-	///
-	/// This is stored as u64::to_be_bytes(count)
-	Present { count: u64 },
-
-	/// Deletable: the block has zero references, and can be deleted
-	/// once time (returned by now_msec) is larger than at_time
-	/// (in millis since Unix epoch)
-	///
-	/// This is stored as [0u8; 8] followed by u64::to_be_bytes(at_time),
-	/// (this allows for the data format to be backwards compatible with
-	/// previous Garage versions that didn't have this intermediate state)
-	Deletable { at_time: u64 },
-
-	/// Absent: the block has zero references, and can be deleted
-	/// immediately
-	Absent,
+struct ErrorCounter {
+	errors: u64,
+	last_try: u64,
 }

-impl RcEntry {
-	fn parse(bytes: &[u8]) -> Self {
-		if bytes.len() == 8 {
-			RcEntry::Present {
-				count: u64::from_be_bytes(bytes.try_into().unwrap()),
-			}
-		} else if bytes.len() == 16 {
-			RcEntry::Deletable {
-				at_time: u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
-			}
-		} else {
-			panic!("Invalid RC entry: {:?}, database is corrupted. This is an error Garage is currently unable to recover from. Sorry, and also please report a bug.",
-				bytes
-			)
+impl ErrorCounter {
+	fn new(now: u64) -> Self {
+		Self {
+			errors: 1,
+			last_try: now,
 		}
 	}

-	fn parse_opt<V: AsRef<[u8]>>(bytes: Option<V>) -> Self {
-		bytes
-			.map(|b| Self::parse(b.as_ref()))
-			.unwrap_or(Self::Absent)
+	fn decode(data: sled::IVec) -> Self {
+		Self {
+			errors: u64::from_be_bytes(data[0..8].try_into().unwrap()),
+			last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()),
+		}
+	}
+	fn encode(&self) -> Vec<u8> {
+		[
+			u64::to_be_bytes(self.errors),
+			u64::to_be_bytes(self.last_try),
+		]
+		.concat()
 	}

-	fn serialize(self) -> Option<Vec<u8>> {
-		match self {
-			RcEntry::Present { count } => Some(u64::to_be_bytes(count).to_vec()),
-			RcEntry::Deletable { at_time } => {
-				Some([u64::to_be_bytes(0), u64::to_be_bytes(at_time)].concat())
-			}
-			RcEntry::Absent => None,
+	fn add1(self, now: u64) -> Self {
+		Self {
+			errors: self.errors + 1,
+			last_try: now,
 		}
 	}

-	fn increment(self) -> Self {
-		let old_count = match self {
-			RcEntry::Present { count } => count,
-			_ => 0,
-		};
-		RcEntry::Present {
-			count: old_count + 1,
+	fn delay_msec(&self) -> u64 {
+		(RESYNC_RETRY_DELAY.as_millis() as u64)
+			<< std::cmp::min(self.errors - 1, RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER)
+	}
+	fn next_try(&self) -> u64 {
+		self.last_try + self.delay_msec()
 	}
 }
-
-	fn decrement(self) -> Self {
-		match self {
-			RcEntry::Present { count } => {
-				if count > 1 {
-					RcEntry::Present { count: count - 1 }
-				} else {
-					RcEntry::Deletable {
-						at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
-					}
-				}
-			}
-			del => del,
-		}
-	}
-
-	fn is_zero(&self) -> bool {
-		matches!(self, RcEntry::Deletable { .. } | RcEntry::Absent)
-	}
-
-	fn is_nonzero(&self) -> bool {
-		!self.is_zero()
-	}
-
-	fn is_deletable(&self) -> bool {
-		match self {
-			RcEntry::Present { .. } => false,
-			RcEntry::Deletable { at_time } => now_msec() > *at_time,
-			RcEntry::Absent => true,
-		}
-	}
-
-	fn is_needed(&self) -> bool {
-		!self.is_deletable()
-	}
-}
-
-fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> {
-	let mut result = Vec::<u8>::new();
-	let mut encoder = Encoder::new(&mut result, level)?;
-	encoder.include_checksum(true)?;
-	std::io::copy(&mut source, &mut encoder)?;
-	encoder.finish()?;
-	Ok(result)
-}
--- a/src/block/metrics.rs
+++ b/src/block/metrics.rs
@ -0,0 +1,102 @@
+use opentelemetry::{global, metrics::*};
+
+use garage_util::sled_counter::SledCountedTree;
+
+/// TableMetrics reference all counter used for metrics
+pub struct BlockManagerMetrics {
+	pub(crate) _resync_queue_len: ValueObserver<u64>,
+	pub(crate) _resync_errored_blocks: ValueObserver<u64>,
+
+	pub(crate) resync_counter: BoundCounter<u64>,
+	pub(crate) resync_error_counter: BoundCounter<u64>,
+	pub(crate) resync_duration: BoundValueRecorder<f64>,
+	pub(crate) resync_send_counter: Counter<u64>,
+	pub(crate) resync_recv_counter: BoundCounter<u64>,
+
+	pub(crate) bytes_read: BoundCounter<u64>,
+	pub(crate) block_read_duration: BoundValueRecorder<f64>,
+	pub(crate) bytes_written: BoundCounter<u64>,
+	pub(crate) block_write_duration: BoundValueRecorder<f64>,
+	pub(crate) delete_counter: BoundCounter<u64>,
+
+	pub(crate) corruption_counter: BoundCounter<u64>,
+}
+
+impl BlockManagerMetrics {
+	pub fn new(resync_queue: SledCountedTree, resync_errors: SledCountedTree) -> Self {
+		let meter = global::meter("garage_model/block");
+		Self {
+			_resync_queue_len: meter
+				.u64_value_observer("block.resync_queue_length", move |observer| {
+					observer.observe(resync_queue.len() as u64, &[])
+				})
+				.with_description(
+					"Number of block hashes queued for local check and possible resync",
+				)
+				.init(),
+			_resync_errored_blocks: meter
+				.u64_value_observer("block.resync_errored_blocks", move |observer| {
+					observer.observe(resync_errors.len() as u64, &[])
+				})
+				.with_description("Number of block hashes whose last resync resulted in an error")
+				.init(),
+
+			resync_counter: meter
+				.u64_counter("block.resync_counter")
+				.with_description("Number of calls to resync_block")
+				.init()
+				.bind(&[]),
+			resync_error_counter: meter
+				.u64_counter("block.resync_error_counter")
+				.with_description("Number of calls to resync_block that returned an error")
+				.init()
+				.bind(&[]),
+			resync_duration: meter
+				.f64_value_recorder("block.resync_duration")
+				.with_description("Duration of resync_block operations")
+				.init()
+				.bind(&[]),
+			resync_send_counter: meter
+				.u64_counter("block.resync_send_counter")
+				.with_description("Number of blocks sent to another node in resync operations")
+				.init(),
+			resync_recv_counter: meter
+				.u64_counter("block.resync_recv_counter")
+				.with_description("Number of blocks received from other nodes in resync operations")
+				.init()
+				.bind(&[]),
+
+			bytes_read: meter
+				.u64_counter("block.bytes_read")
+				.with_description("Number of bytes read from disk")
+				.init()
+				.bind(&[]),
+			block_read_duration: meter
+				.f64_value_recorder("block.read_duration")
+				.with_description("Duration of block read operations")
+				.init()
+				.bind(&[]),
+			bytes_written: meter
+				.u64_counter("block.bytes_written")
+				.with_description("Number of bytes written to disk")
+				.init()
+				.bind(&[]),
+			block_write_duration: meter
+				.f64_value_recorder("block.write_duration")
+				.with_description("Duration of block write operations")
+				.init()
+				.bind(&[]),
+			delete_counter: meter
+				.u64_counter("block.delete_counter")
+				.with_description("Number of blocks deleted")
+				.init()
+				.bind(&[]),
+
+			corruption_counter: meter
+				.u64_counter("block.corruption_counter")
+				.with_description("Data corruptions detected on block reads")
+				.init()
+				.bind(&[]),
+		}
+	}
+}
--- a/src/block/rc.rs
+++ b/src/block/rc.rs
@ -0,0 +1,157 @@
+use std::convert::TryInto;
+
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::time::*;
+
+use crate::manager::BLOCK_GC_DELAY;
+
+pub struct BlockRc {
+	pub(crate) rc: sled::Tree,
+}
+
+impl BlockRc {
+	pub(crate) fn new(rc: sled::Tree) -> Self {
+		Self { rc }
+	}
+
+	/// Increment the reference counter associated to a hash.
+	/// Returns true if the RC goes from zero to nonzero.
+	pub(crate) fn block_incref(&self, hash: &Hash) -> Result<bool, Error> {
+		let old_rc = self
+			.rc
+			.fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
+		let old_rc = RcEntry::parse_opt(old_rc);
+		Ok(old_rc.is_zero())
+	}
+
+	/// Decrement the reference counter associated to a hash.
+	/// Returns true if the RC is now zero.
+	pub(crate) fn block_decref(&self, hash: &Hash) -> Result<bool, Error> {
+		let new_rc = self
+			.rc
+			.update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
+		let new_rc = RcEntry::parse_opt(new_rc);
+		Ok(matches!(new_rc, RcEntry::Deletable { .. }))
+	}
+
+	/// Read a block's reference count
+	pub(crate) fn get_block_rc(&self, hash: &Hash) -> Result<RcEntry, Error> {
+		Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?))
+	}
+
+	/// Delete an entry in the RC table if it is deletable and the
+	/// deletion time has passed
+	pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
+		let now = now_msec();
+		self.rc.update_and_fetch(&hash, |rcval| {
+			let updated = match RcEntry::parse_opt(rcval) {
+				RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
+				v => v,
+			};
+			updated.serialize()
+		})?;
+		Ok(())
+	}
+}
+
+/// Describes the state of the reference counter for a block
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum RcEntry {
+	/// Present: the block has `count` references, with `count` > 0.
+	///
+	/// This is stored as u64::to_be_bytes(count)
+	Present { count: u64 },
+
+	/// Deletable: the block has zero references, and can be deleted
+	/// once time (returned by now_msec) is larger than at_time
+	/// (in millis since Unix epoch)
+	///
+	/// This is stored as [0u8; 8] followed by u64::to_be_bytes(at_time),
+	/// (this allows for the data format to be backwards compatible with
+	/// previous Garage versions that didn't have this intermediate state)
+	Deletable { at_time: u64 },
+
+	/// Absent: the block has zero references, and can be deleted
+	/// immediately
+	Absent,
+}
+
+impl RcEntry {
+	fn parse(bytes: &[u8]) -> Self {
+		if bytes.len() == 8 {
+			RcEntry::Present {
+				count: u64::from_be_bytes(bytes.try_into().unwrap()),
+			}
+		} else if bytes.len() == 16 {
+			RcEntry::Deletable {
+				at_time: u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
+			}
+		} else {
+			panic!("Invalid RC entry: {:?}, database is corrupted. This is an error Garage is currently unable to recover from. Sorry, and also please report a bug.",
+				bytes
+			)
+		}
+	}
+
+	fn parse_opt<V: AsRef<[u8]>>(bytes: Option<V>) -> Self {
+		bytes
+			.map(|b| Self::parse(b.as_ref()))
+			.unwrap_or(Self::Absent)
+	}
+
+	fn serialize(self) -> Option<Vec<u8>> {
+		match self {
+			RcEntry::Present { count } => Some(u64::to_be_bytes(count).to_vec()),
+			RcEntry::Deletable { at_time } => {
+				Some([u64::to_be_bytes(0), u64::to_be_bytes(at_time)].concat())
+			}
+			RcEntry::Absent => None,
+		}
+	}
+
+	fn increment(self) -> Self {
+		let old_count = match self {
+			RcEntry::Present { count } => count,
+			_ => 0,
+		};
+		RcEntry::Present {
+			count: old_count + 1,
+		}
+	}
+
+	fn decrement(self) -> Self {
+		match self {
+			RcEntry::Present { count } => {
+				if count > 1 {
+					RcEntry::Present { count: count - 1 }
+				} else {
+					RcEntry::Deletable {
+						at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
+					}
+				}
+			}
+			del => del,
+		}
+	}
+
+	pub(crate) fn is_zero(&self) -> bool {
+		matches!(self, RcEntry::Deletable { .. } | RcEntry::Absent)
+	}
+
+	pub(crate) fn is_nonzero(&self) -> bool {
+		!self.is_zero()
+	}
+
+	pub(crate) fn is_deletable(&self) -> bool {
+		match self {
+			RcEntry::Present { .. } => false,
+			RcEntry::Deletable { at_time } => now_msec() > *at_time,
+			RcEntry::Absent => true,
+		}
+	}
+
+	pub(crate) fn is_needed(&self) -> bool {
+		!self.is_deletable()
+	}
+}
--- a/src/garage/Cargo.toml
+++ b/src/garage/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "garage"
-version = "0.6.0"
+version = "0.7.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@ -21,17 +21,18 @@ path = "tests/lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-garage_api = { version = "0.6.0", path = "../api" }
-garage_model = { version = "0.6.0", path = "../model" }
-garage_rpc = { version = "0.6.0", path = "../rpc" }
-garage_table = { version = "0.6.0", path = "../table" }
-garage_util = { version = "0.6.0", path = "../util" }
-garage_web = { version = "0.6.0", path = "../web" }
+garage_api = { version = "0.7.0", path = "../api" }
+garage_model = { version = "0.7.0", path = "../model" }
+garage_rpc = { version = "0.7.0", path = "../rpc" }
+garage_table = { version = "0.7.0", path = "../table" }
+garage_util = { version = "0.7.0", path = "../util" }
+garage_web = { version = "0.7.0", path = "../web" }
+garage_admin = { version = "0.7.0", path = "../admin" }

 bytes = "1.0"
 git-version = "0.3.4"
 hex = "0.4"
-log = "0.4"
+tracing = { version = "0.1.30", features = ["log-always"] }
 pretty_env_logger = "0.4"
 rand = "0.8"
 async-trait = "0.1.7"
@ -49,11 +50,16 @@ futures = "0.3"
 futures-util = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

-#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-netapp = "0.3.0"
+#netapp = { version = "0.3.2", git = "https://git.deuxfleurs.fr/lx/netapp" }
+#netapp = { version = "0.4", path = "../../../netapp" }
+netapp = "0.4"

 [dev-dependencies]
-aws-sdk-s3 = "0.6"
+aws-sdk-s3 = "0.8"
+chrono = "0.4"
 http = "0.2"
+hmac = "0.10"
+hyper = { version = "0.14", features = ["client", "http1", "runtime"] }
+sha2 = "0.9"

 static_init = "1.0"
--- a/src/garage/admin.rs
+++ b/src/garage/admin.rs
@ -728,6 +728,12 @@ impl AdminRpcHandler {
 			self.garage.block_manager.resync_queue_len()
 		)
 		.unwrap();
+		writeln!(
+			&mut ret,
+			"  blocks with resync errors: {}",
+			self.garage.block_manager.resync_errors_len()
+		)
+		.unwrap();

 		ret
 	}
--- a/src/garage/cli/init.rs
+++ b/src/garage/cli/init.rs
@ -1,7 +1,5 @@
 use std::path::PathBuf;

-use log::warn;
-
 use garage_util::error::*;

 pub const READ_KEY_ERROR: &str = "Unable to read node key. It will be generated by your garage node the first time is it launched. Ensure that your garage node is currently running. (The node key is supposed to be stored in your metadata directory.)";
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@ -43,7 +43,11 @@ pub async fn cmd_assign_role(
 		resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
 	};

-	let added_node = find_matching_node(status.iter().map(|adv| adv.id), &args.node_id)?;
+	let added_nodes = args
+		.node_ids
+		.iter()
+		.map(|node_id| find_matching_node(status.iter().map(|adv| adv.id), node_id))
+		.collect::<Result<Vec<_>, _>>()?;

 	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;

@ -75,6 +79,7 @@ pub async fn cmd_assign_role(
 		return Err(Error::Message("Invalid capacity value: 0".into()));
 	}

+	for added_node in added_nodes {
 		let new_entry = match roles.get(&added_node) {
 			Some(NodeRoleV(Some(old))) => {
 				let capacity = match args.capacity {
@ -85,10 +90,10 @@ pub async fn cmd_assign_role(
 				let tags = if args.tags.is_empty() {
 					old.tags.clone()
 				} else {
-				args.tags
+					args.tags.clone()
 				};
 				NodeRole {
-				zone: args.zone.unwrap_or_else(|| old.zone.to_string()),
+					zone: args.zone.clone().unwrap_or_else(|| old.zone.to_string()),
 					capacity,
 					tags,
 				}
@ -101,9 +106,12 @@ pub async fn cmd_assign_role(
 							"Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())),
 				};
 				NodeRole {
-				zone: args.zone.ok_or("Please specifiy a zone with the -z flag")?,
+					zone: args
+						.zone
+						.clone()
+						.ok_or("Please specifiy a zone with the -z flag")?,
 					capacity,
-				tags: args.tags,
+					tags: args.tags.clone(),
 				}
 			}
 		};
@ -111,10 +119,11 @@ pub async fn cmd_assign_role(
 		layout
 			.staging
 			.merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry))));
+	}

 	send_layout(rpc_cli, rpc_host, layout).await?;

-	println!("Role change is staged but not yet commited.");
+	println!("Role changes are staged but not yet commited.");
 	println!("Use `garage layout show` to view staged role changes,");
 	println!("and `garage layout apply` to enact staged changes.");
 	Ok(())
--- a/src/garage/cli/structs.rs
+++ b/src/garage/cli/structs.rs
@ -92,8 +92,9 @@ pub enum LayoutOperation {

 #[derive(StructOpt, Debug)]
 pub struct AssignRoleOpt {
-	/// Node to which to assign role (prefix of hexadecimal node id)
-	pub(crate) node_id: String,
+	/// Node(s) to which to assign role (prefix of hexadecimal node id)
+	#[structopt(required = true)]
+	pub(crate) node_ids: Vec<String>,

 	/// Location (zone or datacenter) of the node
 	#[structopt(short = "z", long = "zone")]
--- a/src/garage/main.rs
+++ b/src/garage/main.rs
@ -2,7 +2,7 @@
 //! Garage CLI, used to interact with a running Garage instance, and to launch a Garage instance

 #[macro_use]
-extern crate log;
+extern crate tracing;

 mod admin;
 mod cli;
@ -55,7 +55,7 @@ struct Opt {
 #[tokio::main]
 async fn main() {
 	if std::env::var("RUST_LOG").is_err() {
-		std::env::set_var("RUST_LOG", "garage=info")
+		std::env::set_var("RUST_LOG", "netapp=info,garage=info")
 	}
 	pretty_env_logger::init();
 	sodiumoxide::init().expect("Unable to init sodiumoxide");
@ -106,7 +106,7 @@ async fn cli_command(opt: Opt) -> Result<(), Error> {
 	// Generate a temporary keypair for our RPC client
 	let (_pk, sk) = sodiumoxide::crypto::sign::ed25519::gen_keypair();

-	let netapp = NetApp::new(network_key, sk);
+	let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, sk);

 	// Find and parse the address of the target host
 	let (id, addr) = if let Some(h) = opt.rpc_host {
@ -115,7 +115,7 @@ async fn cli_command(opt: Opt) -> Result<(), Error> {
 	} else {
 		let node_id = garage_rpc::system::read_node_id(&config.as_ref().unwrap().metadata_dir)
 			.err_context(READ_KEY_ERROR)?;
-		if let Some(a) = config.as_ref().map(|c| c.rpc_public_addr).flatten() {
+		if let Some(a) = config.as_ref().and_then(|c| c.rpc_public_addr) {
 			(node_id, a)
 		} else {
 			let default_addr = SocketAddr::new(
--- a/src/garage/server.rs
+++ b/src/garage/server.rs
@ -6,6 +6,8 @@ use garage_util::background::*;
 use garage_util::config::*;
 use garage_util::error::Error;

+use garage_admin::metrics::*;
+use garage_admin::tracing_setup::*;
 use garage_api::run_api_server;
 use garage_model::garage::Garage;
 use garage_web::run_web_server;
@ -34,6 +36,9 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
 		.open()
 		.expect("Unable to open sled DB");

+	info!("Initialize admin web server and metric backend...");
+	let admin_server_init = AdminServer::init();
+
 	info!("Initializing background runner...");
 	let watch_cancel = netapp::util::watch_ctrl_c();
 	let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone());
@ -41,9 +46,14 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
 	info!("Initializing Garage main data store...");
 	let garage = Garage::new(config.clone(), db, background);

+	info!("Initialize tracing...");
+	if let Some(export_to) = config.admin.trace_sink {
+		init_tracing(&export_to, garage.system.id)?;
+	}
+
 	let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));

-	info!("Crate admin RPC handler...");
+	info!("Create admin RPC handler...");
 	AdminRpcHandler::new(garage.clone());

 	info!("Initializing API server...");
@ -58,6 +68,15 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
 		wait_from(watch_cancel.clone()),
 	));

+	let admin_server = if let Some(admin_bind_addr) = config.admin.api_bind_addr {
+		info!("Configure and run admin web server...");
+		Some(tokio::spawn(
+			admin_server_init.run(admin_bind_addr, wait_from(watch_cancel.clone())),
+		))
+	} else {
+		None
+	};
+
 	// Stuff runs

 	// When a cancel signal is sent, stuff stops
@ -67,6 +86,11 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
 	if let Err(e) = web_server.await? {
 		warn!("Web server exited with error: {}", e);
 	}
+	if let Some(a) = admin_server {
+		if let Err(e) = a.await? {
+			warn!("Admin web server exited with error: {}", e);
+		}
+	}

 	// Remove RPC handlers for system to break reference cycles
 	garage.system.netapp.drop_all_handlers();
@ -74,8 +98,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
 	// Await for netapp RPC system to end
 	run_system.await?;

-	// Break last reference cycles so that stuff can terminate properly
-	garage.break_reference_cycles();
+	// Drop all references so that stuff can terminate properly
 	drop(garage);

 	// Await for all background tasks to end
--- a/src/garage/tests/admin.rs
+++ b/src/garage/tests/admin.rs
@ -0,0 +1,74 @@
+use crate::common;
+use crate::common::ext::*;
+
+const BCKT_NAME: &str = "seau";
+
+#[tokio::test]
+async fn test_admin_bucket_perms() {
+	let ctx = common::context();
+
+	let hb = || ctx.client.head_bucket().bucket(BCKT_NAME).send();
+
+	assert!(hb().await.is_err());
+
+	ctx.garage
+		.command()
+		.args(["bucket", "create", BCKT_NAME])
+		.quiet()
+		.expect_success_status("Could not create bucket");
+
+	assert!(hb().await.is_err());
+
+	ctx.garage
+		.command()
+		.args([
+			"bucket",
+			"allow",
+			"--read",
+			"--key",
+			&ctx.garage.key.id,
+			BCKT_NAME,
+		])
+		.quiet()
+		.expect_success_status("Could not create bucket");
+
+	assert!(hb().await.is_ok());
+
+	ctx.garage
+		.command()
+		.args([
+			"bucket",
+			"deny",
+			"--read",
+			"--key",
+			&ctx.garage.key.name,
+			BCKT_NAME,
+		])
+		.quiet()
+		.expect_success_status("Could not create bucket");
+
+	assert!(hb().await.is_err());
+
+	ctx.garage
+		.command()
+		.args([
+			"bucket",
+			"allow",
+			"--read",
+			"--key",
+			&ctx.garage.key.name,
+			BCKT_NAME,
+		])
+		.quiet()
+		.expect_success_status("Could not create bucket");
+
+	assert!(hb().await.is_ok());
+
+	ctx.garage
+		.command()
+		.args(["bucket", "delete", "--yes", BCKT_NAME])
+		.quiet()
+		.expect_success_status("Could not delete bucket");
+
+	assert!(hb().await.is_err());
+}
--- a/src/garage/tests/bucket.rs
+++ b/src/garage/tests/bucket.rs
@ -0,0 +1,87 @@
+use crate::common;
+use aws_sdk_s3::model::BucketLocationConstraint;
+use aws_sdk_s3::output::DeleteBucketOutput;
+
+#[tokio::test]
+async fn test_bucket_all() {
+	let ctx = common::context();
+	let bucket_name = "hello";
+
+	{
+		// Create bucket
+		//@TODO check with an invalid bucket name + with an already existing bucket
+		let r = ctx
+			.client
+			.create_bucket()
+			.bucket(bucket_name)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.location.unwrap(), "/hello");
+	}
+	{
+		// List buckets
+		let r = ctx.client.list_buckets().send().await.unwrap();
+		assert!(r
+			.buckets
+			.as_ref()
+			.unwrap()
+			.iter()
+			.filter(|x| x.name.as_ref().is_some())
+			.find(|x| x.name.as_ref().unwrap() == "hello")
+			.is_some());
+	}
+	{
+		// Get its location
+		let r = ctx
+			.client
+			.get_bucket_location()
+			.bucket(bucket_name)
+			.send()
+			.await
+			.unwrap();
+
+		match r.location_constraint.unwrap() {
+			BucketLocationConstraint::Unknown(v) if v.as_str() == "garage-integ-test" => (),
+			_ => unreachable!("wrong region"),
+		}
+	}
+	{
+		// (Stub) check GetVersioning
+		let r = ctx
+			.client
+			.get_bucket_versioning()
+			.bucket(bucket_name)
+			.send()
+			.await
+			.unwrap();
+
+		assert!(r.status.is_none());
+	}
+	{
+		// Delete bucket
+		// @TODO add a check with a non-empty bucket and check failure
+		let r = ctx
+			.client
+			.delete_bucket()
+			.bucket(bucket_name)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r, DeleteBucketOutput::builder().build());
+	}
+	{
+		// Check bucket is deleted with List buckets
+		let r = ctx.client.list_buckets().send().await.unwrap();
+		assert!(r
+			.buckets
+			.as_ref()
+			.unwrap()
+			.iter()
+			.filter(|x| x.name.as_ref().is_some())
+			.find(|x| x.name.as_ref().unwrap() == "hello")
+			.is_none());
+	}
+}
--- a/src/garage/tests/common/custom_requester.rs
+++ b/src/garage/tests/common/custom_requester.rs
@ -0,0 +1,265 @@
+#![allow(dead_code)]
+
+use std::collections::HashMap;
+use std::convert::TryFrom;
+
+use chrono::{offset::Utc, DateTime};
+use hmac::{Hmac, Mac};
+use hyper::client::HttpConnector;
+use hyper::{Body, Client, Method, Request, Response, Uri};
+
+use super::garage::{Instance, Key};
+use garage_api::signature;
+
+/// You should ever only use this to send requests AWS sdk won't send,
+/// like to reproduce behavior of unusual implementations found to be
+/// problematic.
+pub struct CustomRequester {
+	key: Key,
+	uri: Uri,
+	client: Client<HttpConnector>,
+}
+
+impl CustomRequester {
+	pub fn new(instance: &Instance) -> Self {
+		CustomRequester {
+			key: instance.key.clone(),
+			uri: instance.uri(),
+			client: Client::new(),
+		}
+	}
+
+	pub fn builder(&self, bucket: String) -> RequestBuilder<'_> {
+		RequestBuilder {
+			requester: self,
+			bucket,
+			method: Method::GET,
+			path: String::new(),
+			query_params: HashMap::new(),
+			signed_headers: HashMap::new(),
+			unsigned_headers: HashMap::new(),
+			body: Vec::new(),
+			body_signature: BodySignature::Classic,
+			vhost_style: false,
+		}
+	}
+}
+
+pub struct RequestBuilder<'a> {
+	requester: &'a CustomRequester,
+	bucket: String,
+	method: Method,
+	path: String,
+	query_params: HashMap<String, Option<String>>,
+	signed_headers: HashMap<String, String>,
+	unsigned_headers: HashMap<String, String>,
+	body: Vec<u8>,
+	body_signature: BodySignature,
+	vhost_style: bool,
+}
+
+impl<'a> RequestBuilder<'a> {
+	pub fn method(&mut self, method: Method) -> &mut Self {
+		self.method = method;
+		self
+	}
+
+	pub fn path(&mut self, path: String) -> &mut Self {
+		self.path = path;
+		self
+	}
+
+	pub fn query_params(&mut self, query_params: HashMap<String, Option<String>>) -> &mut Self {
+		self.query_params = query_params;
+		self
+	}
+
+	pub fn signed_headers(&mut self, signed_headers: HashMap<String, String>) -> &mut Self {
+		self.signed_headers = signed_headers;
+		self
+	}
+
+	pub fn unsigned_headers(&mut self, unsigned_headers: HashMap<String, String>) -> &mut Self {
+		self.unsigned_headers = unsigned_headers;
+		self
+	}
+
+	pub fn body(&mut self, body: Vec<u8>) -> &mut Self {
+		self.body = body;
+		self
+	}
+
+	pub fn body_signature(&mut self, body_signature: BodySignature) -> &mut Self {
+		self.body_signature = body_signature;
+		self
+	}
+
+	pub fn vhost_style(&mut self, vhost_style: bool) -> &mut Self {
+		self.vhost_style = vhost_style;
+		self
+	}
+
+	pub async fn send(&mut self) -> hyper::Result<Response<Body>> {
+		// TODO this is a bit incorrect in that path and query params should be url-encoded and
+		// aren't, but this is good enought for now.
+
+		let query = query_param_to_string(&self.query_params);
+		let (host, path) = if self.vhost_style {
+			(
+				format!("{}.s3.garage", self.bucket),
+				format!("{}{}", self.path, query),
+			)
+		} else {
+			(
+				"s3.garage".to_owned(),
+				format!("{}/{}{}", self.bucket, self.path, query),
+			)
+		};
+		let uri = format!("{}{}", self.requester.uri, path);
+
+		let now = Utc::now();
+		let scope = signature::compute_scope(&now, super::REGION.as_ref());
+		let mut signer = signature::signing_hmac(
+			&now,
+			&self.requester.key.secret,
+			super::REGION.as_ref(),
+			"s3",
+		)
+		.unwrap();
+		let streaming_signer = signer.clone();
+
+		let mut all_headers = self.signed_headers.clone();
+
+		let date = now.format(signature::LONG_DATETIME).to_string();
+		all_headers.insert("x-amz-date".to_owned(), date);
+		all_headers.insert("host".to_owned(), host);
+
+		let body_sha = match self.body_signature {
+			BodySignature::Unsigned => "UNSIGNED-PAYLOAD".to_owned(),
+			BodySignature::Classic => hex::encode(garage_util::data::sha256sum(&self.body)),
+			BodySignature::Streaming(size) => {
+				all_headers.insert("content-encoding".to_owned(), "aws-chunked".to_owned());
+				all_headers.insert(
+					"x-amz-decoded-content-length".to_owned(),
+					self.body.len().to_string(),
+				);
+				// Get lenght of body by doing the conversion to a streaming body with an
+				// invalid signature (we don't know the seed) just to get its length. This
+				// is a pretty lazy and inefficient way to do it, but it's enought for test
+				// code.
+				all_headers.insert(
+					"content-length".to_owned(),
+					to_streaming_body(&self.body, size, String::new(), signer.clone(), now, "")
+						.len()
+						.to_string(),
+				);
+
+				"STREAMING-AWS4-HMAC-SHA256-PAYLOAD".to_owned()
+			}
+		};
+		all_headers.insert("x-amz-content-sha256".to_owned(), body_sha.clone());
+
+		let mut signed_headers = all_headers
+			.iter()
+			.map(|(k, _)| k.as_ref())
+			.collect::<Vec<&str>>();
+		signed_headers.sort();
+		let signed_headers = signed_headers.join(";");
+
+		all_headers.extend(self.unsigned_headers.clone());
+
+		let canonical_request = signature::payload::canonical_request(
+			&self.method,
+			&Uri::try_from(&uri).unwrap(),
+			&all_headers,
+			&signed_headers,
+			&body_sha,
+		);
+
+		let string_to_sign = signature::payload::string_to_sign(&now, &scope, &canonical_request);
+
+		signer.update(string_to_sign.as_bytes());
+		let signature = hex::encode(signer.finalize().into_bytes());
+		let authorization = format!(
+			"AWS4-HMAC-SHA256 Credential={}/{},SignedHeaders={},Signature={}",
+			self.requester.key.id, scope, signed_headers, signature
+		);
+		all_headers.insert("authorization".to_owned(), authorization);
+
+		let mut request = Request::builder();
+		for (k, v) in all_headers {
+			request = request.header(k, v);
+		}
+
+		let body = if let BodySignature::Streaming(size) = self.body_signature {
+			to_streaming_body(&self.body, size, signature, streaming_signer, now, &scope)
+		} else {
+			self.body.clone()
+		};
+		let request = request
+			.uri(uri)
+			.method(self.method.clone())
+			.body(Body::from(body))
+			.unwrap();
+		self.requester.client.request(request).await
+	}
+}
+
+pub enum BodySignature {
+	Unsigned,
+	Classic,
+	Streaming(usize),
+}
+
+fn query_param_to_string(params: &HashMap<String, Option<String>>) -> String {
+	if params.is_empty() {
+		return String::new();
+	}
+
+	"?".to_owned()
+		+ &params
+			.iter()
+			.map(|(k, v)| {
+				if let Some(v) = v {
+					format!("{}={}", k, v)
+				} else {
+					k.clone()
+				}
+			})
+			.collect::<Vec<String>>()
+			.join("&")
+}
+
+fn to_streaming_body(
+	body: &[u8],
+	chunk_size: usize,
+	mut seed: String,
+	hasher: Hmac<sha2::Sha256>,
+	now: DateTime<Utc>,
+	scope: &str,
+) -> Vec<u8> {
+	const SHA_NULL: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
+	let now = now.format(signature::LONG_DATETIME).to_string();
+	let mut res = Vec::with_capacity(body.len());
+	for chunk in body.chunks(chunk_size).chain(std::iter::once(&[][..])) {
+		let to_sign = format!(
+			"AWS4-HMAC-SHA256-PAYLOAD\n{}\n{}\n{}\n{}\n{}",
+			now,
+			scope,
+			seed,
+			SHA_NULL,
+			hex::encode(garage_util::data::sha256sum(chunk))
+		);
+
+		let mut hasher = hasher.clone();
+		hasher.update(to_sign.as_bytes());
+		seed = hex::encode(hasher.finalize().into_bytes());
+
+		let header = format!("{:x};chunk-signature={}\r\n", chunk.len(), seed);
+		res.extend_from_slice(header.as_bytes());
+		res.extend_from_slice(chunk);
+		res.extend_from_slice(b"\r\n");
+	}
+
+	res
+}
--- a/src/garage/tests/common/garage.rs
+++ b/src/garage/tests/common/garage.rs
@ -6,12 +6,12 @@ use std::sync::Once;
 use super::ext::*;

 // https://xkcd.com/221/
-const DEFAULT_PORT: u16 = 49995;
+pub const DEFAULT_PORT: u16 = 49995;

 static GARAGE_TEST_SECRET: &str =
 	"c3ea8cb80333d04e208d136698b1a01ae370d463f0d435ab2177510b3478bf44";

-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub struct Key {
 	pub name: String,
 	pub id: String,
@ -65,6 +65,9 @@ root_domain = ".s3.garage"
 bind_addr = "127.0.0.1:{web_port}"
 root_domain = ".web.garage"
 index = "index.html"
+
+[admin]
+api_bind_addr = "127.0.0.1:{admin_port}"
 "#,
 			path = path.display(),
 			secret = GARAGE_TEST_SECRET,
@ -72,6 +75,7 @@ index = "index.html"
 			api_port = port,
 			rpc_port = port + 1,
 			web_port = port + 2,
+			admin_port = port + 3,
 		);
 		fs::write(path.join("config.toml"), config).expect("Could not write garage config file");

--- a/src/garage/tests/common/mod.rs
+++ b/src/garage/tests/common/mod.rs
@ -5,22 +5,31 @@ use ext::*;
 pub mod macros;

 pub mod client;
+pub mod custom_requester;
 pub mod ext;
 pub mod garage;

+use custom_requester::CustomRequester;
+
 const REGION: Region = Region::from_static("garage-integ-test");

 pub struct Context {
 	pub garage: &'static garage::Instance,
 	pub client: Client,
+	pub custom_request: CustomRequester,
 }

 impl Context {
 	fn new() -> Self {
 		let garage = garage::instance();
 		let client = client::build_client(garage);
+		let custom_request = CustomRequester::new(garage);

-		Context { garage, client }
+		Context {
+			garage,
+			client,
+			custom_request,
+		}
 	}

 	/// Create an unique bucket with a random suffix.
--- a/src/garage/tests/lib.rs
+++ b/src/garage/tests/lib.rs
@ -1,4 +1,11 @@
 #[macro_use]
 mod common;

+mod admin;
+mod bucket;
+mod list;
+mod multipart;
+mod objects;
 mod simple;
+mod streaming_signature;
+mod website;
--- a/src/garage/tests/list.rs
+++ b/src/garage/tests/list.rs
@ -0,0 +1,615 @@
+use crate::common;
+
+const KEYS: [&str; 8] = ["a", "a/a", "a/b", "a/c", "a/d/a", "a/é", "b", "c"];
+const KEYS_MULTIPART: [&str; 5] = ["a", "a", "c", "c/a", "c/b"];
+
+#[tokio::test]
+async fn test_listobjectsv2() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("listobjectsv2");
+
+	for k in KEYS {
+		ctx.client
+			.put_object()
+			.bucket(&bucket)
+			.key(k)
+			.send()
+			.await
+			.unwrap();
+	}
+
+	{
+		// Scoping the variable to avoid reusing it
+		// in a following assert due to copy paste
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 8);
+		assert!(r.common_prefixes.is_none());
+	}
+
+	//@FIXME aws-sdk-s3 automatically checks max-key values.
+	// If we set it to zero, it drops it, and it is probably
+	// the same behavior on values bigger than 1000.
+	// Boto and awscli do not perform these tests, we should write
+	// our own minimal library to bypass AWS SDK's tests and be
+	// sure that we behave correctly.
+
+	{
+		// With 2 elements
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.max_keys(2)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 2);
+		assert!(r.common_prefixes.is_none());
+		assert!(r.next_continuation_token.is_some());
+	}
+
+	{
+		// With pagination
+		let mut cnt = 0;
+		let mut next = None;
+		let last_idx = KEYS.len() - 1;
+
+		for i in 0..KEYS.len() {
+			let r = ctx
+				.client
+				.list_objects_v2()
+				.bucket(&bucket)
+				.set_continuation_token(next)
+				.max_keys(1)
+				.send()
+				.await
+				.unwrap();
+
+			cnt += 1;
+			next = r.next_continuation_token;
+
+			assert_eq!(r.contents.unwrap().len(), 1);
+			assert!(r.common_prefixes.is_none());
+			if i != last_idx {
+				assert!(next.is_some());
+			}
+		}
+		assert_eq!(cnt, KEYS.len());
+	}
+
+	{
+		// With a delimiter
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.delimiter("/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 3);
+		assert_eq!(r.common_prefixes.unwrap().len(), 1);
+	}
+
+	{
+		// With a delimiter and pagination
+		let mut cnt_pfx = 0;
+		let mut cnt_key = 0;
+		let mut next = None;
+
+		for _i in 0..KEYS.len() {
+			let r = ctx
+				.client
+				.list_objects_v2()
+				.bucket(&bucket)
+				.set_continuation_token(next)
+				.delimiter("/")
+				.max_keys(1)
+				.send()
+				.await
+				.unwrap();
+
+			next = r.next_continuation_token;
+			match (r.contents, r.common_prefixes) {
+				(Some(k), None) if k.len() == 1 => cnt_key += 1,
+				(None, Some(pfx)) if pfx.len() == 1 => cnt_pfx += 1,
+				_ => unreachable!("logic error"),
+			};
+			if next.is_none() {
+				break;
+			}
+		}
+		assert_eq!(cnt_key, 3);
+		assert_eq!(cnt_pfx, 1);
+	}
+
+	{
+		// With a prefix
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.prefix("a/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 5);
+		assert!(r.common_prefixes.is_none());
+	}
+
+	{
+		// With a prefix and a delimiter
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.prefix("a/")
+			.delimiter("/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 4);
+		assert_eq!(r.common_prefixes.unwrap().len(), 1);
+	}
+
+	{
+		// With a prefix, a delimiter and max_key
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.prefix("a/")
+			.delimiter("/")
+			.max_keys(1)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.as_ref().unwrap().len(), 1);
+		assert_eq!(
+			r.contents
+				.unwrap()
+				.first()
+				.unwrap()
+				.key
+				.as_ref()
+				.unwrap()
+				.as_str(),
+			"a/a"
+		);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With start_after before all keys
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.start_after("Z")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 8);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With start_after after all keys
+		let r = ctx
+			.client
+			.list_objects_v2()
+			.bucket(&bucket)
+			.start_after("c")
+			.send()
+			.await
+			.unwrap();
+
+		assert!(r.contents.is_none());
+		assert!(r.common_prefixes.is_none());
+	}
+}
+
+#[tokio::test]
+async fn test_listobjectsv1() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("listobjects");
+
+	for k in KEYS {
+		ctx.client
+			.put_object()
+			.bucket(&bucket)
+			.key(k)
+			.send()
+			.await
+			.unwrap();
+	}
+
+	{
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 8);
+		assert!(r.common_prefixes.is_none());
+	}
+
+	{
+		// With 2 elements
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.max_keys(2)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 2);
+		assert!(r.common_prefixes.is_none());
+		assert!(r.next_marker.is_some());
+	}
+
+	{
+		// With pagination
+		let mut cnt = 0;
+		let mut next = None;
+		let last_idx = KEYS.len() - 1;
+
+		for i in 0..KEYS.len() {
+			let r = ctx
+				.client
+				.list_objects()
+				.bucket(&bucket)
+				.set_marker(next)
+				.max_keys(1)
+				.send()
+				.await
+				.unwrap();
+
+			cnt += 1;
+			next = r.next_marker;
+
+			assert_eq!(r.contents.unwrap().len(), 1);
+			assert!(r.common_prefixes.is_none());
+			if i != last_idx {
+				assert!(next.is_some());
+			}
+		}
+		assert_eq!(cnt, KEYS.len());
+	}
+
+	{
+		// With a delimiter
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.delimiter("/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 3);
+		assert_eq!(r.common_prefixes.unwrap().len(), 1);
+	}
+
+	{
+		// With a delimiter and pagination
+		let mut cnt_pfx = 0;
+		let mut cnt_key = 0;
+		let mut next = None;
+
+		for _i in 0..KEYS.len() {
+			let r = ctx
+				.client
+				.list_objects()
+				.bucket(&bucket)
+				.delimiter("/")
+				.set_marker(next)
+				.max_keys(1)
+				.send()
+				.await
+				.unwrap();
+
+			next = r.next_marker;
+			match (r.contents, r.common_prefixes) {
+				(Some(k), None) if k.len() == 1 => cnt_key += 1,
+				(None, Some(pfx)) if pfx.len() == 1 => cnt_pfx += 1,
+				_ => unreachable!("logic error"),
+			};
+			if next.is_none() {
+				break;
+			}
+		}
+		assert_eq!(cnt_key, 3);
+		// We have no optimization to skip the whole prefix
+		// on listobjectsv1 so we return the same one 5 times,
+		// for each element. It is up to the client to merge its result.
+		// This is compliant with AWS spec.
+		assert_eq!(cnt_pfx, 5);
+	}
+
+	{
+		// With a prefix
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.prefix("a/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 5);
+		assert!(r.common_prefixes.is_none());
+	}
+
+	{
+		// With a prefix and a delimiter
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.prefix("a/")
+			.delimiter("/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 4);
+		assert_eq!(r.common_prefixes.unwrap().len(), 1);
+	}
+
+	{
+		// With a prefix, a delimiter and max_key
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.prefix("a/")
+			.delimiter("/")
+			.max_keys(1)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.as_ref().unwrap().len(), 1);
+		assert_eq!(
+			r.contents
+				.unwrap()
+				.first()
+				.unwrap()
+				.key
+				.as_ref()
+				.unwrap()
+				.as_str(),
+			"a/a"
+		);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With marker before all keys
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.marker("Z")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.contents.unwrap().len(), 8);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With start_after after all keys
+		let r = ctx
+			.client
+			.list_objects()
+			.bucket(&bucket)
+			.marker("c")
+			.send()
+			.await
+			.unwrap();
+
+		assert!(r.contents.is_none());
+		assert!(r.common_prefixes.is_none());
+	}
+}
+
+#[tokio::test]
+async fn test_listmultipart() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("listmultipartuploads");
+
+	for k in KEYS_MULTIPART {
+		ctx.client
+			.create_multipart_upload()
+			.bucket(&bucket)
+			.key(k)
+			.send()
+			.await
+			.unwrap();
+	}
+
+	{
+		// Default
+		let r = ctx
+			.client
+			.list_multipart_uploads()
+			.bucket(&bucket)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.uploads.unwrap().len(), 5);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With pagination
+		let mut next = None;
+		let mut upnext = None;
+		let last_idx = KEYS_MULTIPART.len() - 1;
+
+		for i in 0..KEYS_MULTIPART.len() {
+			let r = ctx
+				.client
+				.list_multipart_uploads()
+				.bucket(&bucket)
+				.set_key_marker(next)
+				.set_upload_id_marker(upnext)
+				.max_uploads(1)
+				.send()
+				.await
+				.unwrap();
+
+			next = r.next_key_marker;
+			upnext = r.next_upload_id_marker;
+
+			assert_eq!(r.uploads.unwrap().len(), 1);
+			assert!(r.common_prefixes.is_none());
+			if i != last_idx {
+				assert!(next.is_some());
+			}
+		}
+	}
+	{
+		// With delimiter
+		let r = ctx
+			.client
+			.list_multipart_uploads()
+			.bucket(&bucket)
+			.delimiter("/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.uploads.unwrap().len(), 3);
+		assert_eq!(r.common_prefixes.unwrap().len(), 1);
+	}
+	{
+		// With delimiter and pagination
+		let mut next = None;
+		let mut upnext = None;
+		let mut upcnt = 0;
+		let mut pfxcnt = 0;
+		let mut loopcnt = 0;
+
+		while loopcnt < KEYS_MULTIPART.len() {
+			let r = ctx
+				.client
+				.list_multipart_uploads()
+				.bucket(&bucket)
+				.delimiter("/")
+				.max_uploads(1)
+				.set_key_marker(next)
+				.set_upload_id_marker(upnext)
+				.send()
+				.await
+				.unwrap();
+
+			next = r.next_key_marker;
+			upnext = r.next_upload_id_marker;
+
+			loopcnt += 1;
+			upcnt += r.uploads.unwrap_or_default().len();
+			pfxcnt += r.common_prefixes.unwrap_or_default().len();
+
+			if next.is_none() {
+				break;
+			}
+		}
+
+		assert_eq!(upcnt + pfxcnt, loopcnt);
+		assert_eq!(upcnt, 3);
+		assert_eq!(pfxcnt, 1);
+	}
+	{
+		// With prefix
+		let r = ctx
+			.client
+			.list_multipart_uploads()
+			.bucket(&bucket)
+			.prefix("c")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.uploads.unwrap().len(), 3);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With prefix and delimiter
+		let r = ctx
+			.client
+			.list_multipart_uploads()
+			.bucket(&bucket)
+			.prefix("c")
+			.delimiter("/")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.uploads.unwrap().len(), 1);
+		assert_eq!(r.common_prefixes.unwrap().len(), 1);
+	}
+	{
+		// With prefix, delimiter and max keys
+		let r = ctx
+			.client
+			.list_multipart_uploads()
+			.bucket(&bucket)
+			.prefix("c")
+			.delimiter("/")
+			.max_uploads(1)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.uploads.unwrap().len(), 1);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With starting token before the first element
+		let r = ctx
+			.client
+			.list_multipart_uploads()
+			.bucket(&bucket)
+			.key_marker("ZZZZZ")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.uploads.unwrap().len(), 5);
+		assert!(r.common_prefixes.is_none());
+	}
+	{
+		// With starting token after the last element
+		let r = ctx
+			.client
+			.list_multipart_uploads()
+			.bucket(&bucket)
+			.key_marker("d")
+			.send()
+			.await
+			.unwrap();
+
+		assert!(r.uploads.is_none());
+		assert!(r.common_prefixes.is_none());
+	}
+}
--- a/src/garage/tests/multipart.rs
+++ b/src/garage/tests/multipart.rs
@ -0,0 +1,415 @@
+use crate::common;
+use aws_sdk_s3::model::{CompletedMultipartUpload, CompletedPart};
+use aws_sdk_s3::types::ByteStream;
+
+const SZ_5MB: usize = 5 * 1024 * 1024;
+const SZ_10MB: usize = 10 * 1024 * 1024;
+
+#[tokio::test]
+async fn test_uploadlistpart() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("uploadpart");
+
+	let u1 = vec![0xee; SZ_5MB];
+	let u2 = vec![0x11; SZ_5MB];
+
+	let up = ctx
+		.client
+		.create_multipart_upload()
+		.bucket(&bucket)
+		.key("a")
+		.send()
+		.await
+		.unwrap();
+	let uid = up.upload_id.as_ref().unwrap();
+
+	assert!(up.upload_id.is_some());
+
+	{
+		let r = ctx
+			.client
+			.list_parts()
+			.bucket(&bucket)
+			.key("a")
+			.upload_id(uid)
+			.send()
+			.await
+			.unwrap();
+
+		assert!(r.parts.is_none());
+	}
+
+	let p1 = ctx
+		.client
+		.upload_part()
+		.bucket(&bucket)
+		.key("a")
+		.upload_id(uid)
+		.part_number(2)
+		.body(ByteStream::from(u1))
+		.send()
+		.await
+		.unwrap();
+
+	{
+		// ListPart on 1st element
+		let r = ctx
+			.client
+			.list_parts()
+			.bucket(&bucket)
+			.key("a")
+			.upload_id(uid)
+			.send()
+			.await
+			.unwrap();
+
+		let ps = r.parts.unwrap();
+		assert_eq!(ps.len(), 1);
+		let fp = ps.iter().find(|x| x.part_number == 2).unwrap();
+		assert!(fp.last_modified.is_some());
+		assert_eq!(
+			fp.e_tag.as_ref().unwrap(),
+			"\"3366bb9dcf710d6801b5926467d02e19\""
+		);
+		assert_eq!(fp.size, SZ_5MB as i64);
+	}
+
+	let p2 = ctx
+		.client
+		.upload_part()
+		.bucket(&bucket)
+		.key("a")
+		.upload_id(uid)
+		.part_number(1)
+		.body(ByteStream::from(u2))
+		.send()
+		.await
+		.unwrap();
+
+	{
+		// ListPart on the 2 elements
+		let r = ctx
+			.client
+			.list_parts()
+			.bucket(&bucket)
+			.key("a")
+			.upload_id(uid)
+			.send()
+			.await
+			.unwrap();
+
+		let ps = r.parts.unwrap();
+		assert_eq!(ps.len(), 2);
+		let fp = ps.iter().find(|x| x.part_number == 1).unwrap();
+		assert!(fp.last_modified.is_some());
+		assert_eq!(
+			fp.e_tag.as_ref().unwrap(),
+			"\"3c484266f9315485694556e6c693bfa2\""
+		);
+		assert_eq!(fp.size, SZ_5MB as i64);
+	}
+
+	{
+		// Call pagination
+		let r = ctx
+			.client
+			.list_parts()
+			.bucket(&bucket)
+			.key("a")
+			.upload_id(uid)
+			.max_parts(1)
+			.send()
+			.await
+			.unwrap();
+
+		assert!(r.part_number_marker.is_none());
+		assert!(r.next_part_number_marker.is_some());
+		assert_eq!(r.max_parts, 1_i32);
+		assert!(r.is_truncated);
+		assert_eq!(r.key.unwrap(), "a");
+		assert_eq!(r.upload_id.unwrap().as_str(), uid.as_str());
+		assert_eq!(r.parts.unwrap().len(), 1);
+
+		let r2 = ctx
+			.client
+			.list_parts()
+			.bucket(&bucket)
+			.key("a")
+			.upload_id(uid)
+			.max_parts(1)
+			.part_number_marker(r.next_part_number_marker.as_ref().unwrap())
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(
+			r2.part_number_marker.as_ref().unwrap(),
+			r.next_part_number_marker.as_ref().unwrap()
+		);
+		assert_eq!(r2.max_parts, 1_i32);
+		assert!(r2.is_truncated);
+		assert_eq!(r2.key.unwrap(), "a");
+		assert_eq!(r2.upload_id.unwrap().as_str(), uid.as_str());
+		assert_eq!(r2.parts.unwrap().len(), 1);
+	}
+
+	let cmp = CompletedMultipartUpload::builder()
+		.parts(
+			CompletedPart::builder()
+				.part_number(1)
+				.e_tag(p2.e_tag.unwrap())
+				.build(),
+		)
+		.parts(
+			CompletedPart::builder()
+				.part_number(2)
+				.e_tag(p1.e_tag.unwrap())
+				.build(),
+		)
+		.build();
+
+	ctx.client
+		.complete_multipart_upload()
+		.bucket(&bucket)
+		.key("a")
+		.upload_id(uid)
+		.multipart_upload(cmp)
+		.send()
+		.await
+		.unwrap();
+
+	// The multipart upload must not appear anymore
+	assert!(ctx
+		.client
+		.list_parts()
+		.bucket(&bucket)
+		.key("a")
+		.upload_id(uid)
+		.send()
+		.await
+		.is_err());
+
+	{
+		// The object must appear as a regular object
+		let r = ctx
+			.client
+			.head_object()
+			.bucket(&bucket)
+			.key("a")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.content_length, (SZ_5MB * 2) as i64);
+	}
+}
+
+#[tokio::test]
+async fn test_uploadpartcopy() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("uploadpartcopy");
+
+	let u1 = vec![0x11; SZ_10MB];
+	let u2 = vec![0x22; SZ_5MB];
+	let u3 = vec![0x33; SZ_5MB];
+	let u4 = vec![0x44; SZ_5MB];
+	let u5 = vec![0x55; SZ_5MB];
+
+	let overflow = 5500000 - SZ_5MB;
+	let mut exp_obj = u3.clone();
+	exp_obj.extend(&u4[500..]);
+	exp_obj.extend(&u5[..overflow + 1]);
+	exp_obj.extend(&u2);
+	exp_obj.extend(&u1[500..5500000 + 1]);
+
+	// (setup) Upload a single part object
+	ctx.client
+		.put_object()
+		.bucket(&bucket)
+		.key("source1")
+		.body(ByteStream::from(u1))
+		.send()
+		.await
+		.unwrap();
+
+	// (setup) Upload a multipart object with 2 parts
+	{
+		let up = ctx
+			.client
+			.create_multipart_upload()
+			.bucket(&bucket)
+			.key("source2")
+			.send()
+			.await
+			.unwrap();
+		let uid = up.upload_id.as_ref().unwrap();
+
+		let p1 = ctx
+			.client
+			.upload_part()
+			.bucket(&bucket)
+			.key("source2")
+			.upload_id(uid)
+			.part_number(1)
+			.body(ByteStream::from(u4))
+			.send()
+			.await
+			.unwrap();
+
+		let p2 = ctx
+			.client
+			.upload_part()
+			.bucket(&bucket)
+			.key("source2")
+			.upload_id(uid)
+			.part_number(2)
+			.body(ByteStream::from(u5))
+			.send()
+			.await
+			.unwrap();
+
+		let cmp = CompletedMultipartUpload::builder()
+			.parts(
+				CompletedPart::builder()
+					.part_number(1)
+					.e_tag(p1.e_tag.unwrap())
+					.build(),
+			)
+			.parts(
+				CompletedPart::builder()
+					.part_number(2)
+					.e_tag(p2.e_tag.unwrap())
+					.build(),
+			)
+			.build();
+
+		ctx.client
+			.complete_multipart_upload()
+			.bucket(&bucket)
+			.key("source2")
+			.upload_id(uid)
+			.multipart_upload(cmp)
+			.send()
+			.await
+			.unwrap();
+	}
+
+	// Our multipart object that does copy
+	let up = ctx
+		.client
+		.create_multipart_upload()
+		.bucket(&bucket)
+		.key("target")
+		.send()
+		.await
+		.unwrap();
+	let uid = up.upload_id.as_ref().unwrap();
+
+	let p3 = ctx
+		.client
+		.upload_part()
+		.bucket(&bucket)
+		.key("target")
+		.upload_id(uid)
+		.part_number(3)
+		.body(ByteStream::from(u2))
+		.send()
+		.await
+		.unwrap();
+
+	let p1 = ctx
+		.client
+		.upload_part()
+		.bucket(&bucket)
+		.key("target")
+		.upload_id(uid)
+		.part_number(1)
+		.body(ByteStream::from(u3))
+		.send()
+		.await
+		.unwrap();
+
+	let p2 = ctx
+		.client
+		.upload_part_copy()
+		.bucket(&bucket)
+		.key("target")
+		.upload_id(uid)
+		.part_number(2)
+		.copy_source("uploadpartcopy/source2")
+		.copy_source_range("bytes=500-5500000")
+		.send()
+		.await
+		.unwrap();
+
+	let p4 = ctx
+		.client
+		.upload_part_copy()
+		.bucket(&bucket)
+		.key("target")
+		.upload_id(uid)
+		.part_number(4)
+		.copy_source("uploadpartcopy/source1")
+		.copy_source_range("bytes=500-5500000")
+		.send()
+		.await
+		.unwrap();
+
+	let cmp = CompletedMultipartUpload::builder()
+		.parts(
+			CompletedPart::builder()
+				.part_number(1)
+				.e_tag(p1.e_tag.unwrap())
+				.build(),
+		)
+		.parts(
+			CompletedPart::builder()
+				.part_number(2)
+				.e_tag(p2.copy_part_result.unwrap().e_tag.unwrap())
+				.build(),
+		)
+		.parts(
+			CompletedPart::builder()
+				.part_number(3)
+				.e_tag(p3.e_tag.unwrap())
+				.build(),
+		)
+		.parts(
+			CompletedPart::builder()
+				.part_number(4)
+				.e_tag(p4.copy_part_result.unwrap().e_tag.unwrap())
+				.build(),
+		)
+		.build();
+
+	ctx.client
+		.complete_multipart_upload()
+		.bucket(&bucket)
+		.key("target")
+		.upload_id(uid)
+		.multipart_upload(cmp)
+		.send()
+		.await
+		.unwrap();
+
+	// (check) Get object
+
+	let obj = ctx
+		.client
+		.get_object()
+		.bucket(&bucket)
+		.key("target")
+		.send()
+		.await
+		.unwrap();
+
+	let real_obj = obj
+		.body
+		.collect()
+		.await
+		.expect("Error reading data")
+		.into_bytes();
+
+	assert_eq!(real_obj.len(), exp_obj.len());
+	assert_eq!(real_obj, exp_obj);
+}
--- a/src/garage/tests/objects.rs
+++ b/src/garage/tests/objects.rs
@ -0,0 +1,266 @@
+use crate::common;
+use aws_sdk_s3::model::{Delete, ObjectIdentifier};
+use aws_sdk_s3::types::ByteStream;
+
+const STD_KEY: &str = "hello world";
+const CTRL_KEY: &str = "\x00\x01\x02\x00";
+const UTF8_KEY: &str = "\u{211D}\u{1F923}\u{1F44B}";
+const BODY: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+#[tokio::test]
+async fn test_putobject() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("putobject");
+
+	{
+		// Send an empty object (can serve as a directory marker)
+		// with a content type
+		let etag = "\"d41d8cd98f00b204e9800998ecf8427e\"";
+		let content_type = "text/csv";
+		let r = ctx
+			.client
+			.put_object()
+			.bucket(&bucket)
+			.key(STD_KEY)
+			.content_type(content_type)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.e_tag.unwrap().as_str(), etag);
+		// We return a version ID here
+		// We should check if Amazon is returning one when versioning is not enabled
+		assert!(r.version_id.is_some());
+
+		let _version = r.version_id.unwrap();
+
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			.key(STD_KEY)
+			.send()
+			.await
+			.unwrap();
+
+		assert_bytes_eq!(o.body, b"");
+		assert_eq!(o.e_tag.unwrap(), etag);
+		// We do not return version ID
+		// We should check if Amazon is returning one when versioning is not enabled
+		// assert_eq!(o.version_id.unwrap(), _version);
+		assert_eq!(o.content_type.unwrap(), content_type);
+		assert!(o.last_modified.is_some());
+		assert_eq!(o.content_length, 0);
+		assert_eq!(o.parts_count, 0);
+		assert_eq!(o.tag_count, 0);
+	}
+
+	{
+		// Key with control characters,
+		// no content type and some data
+		let etag = "\"49f68a5c8493ec2c0bf489821c21fc3b\"";
+		let data = ByteStream::from_static(b"hi");
+
+		let r = ctx
+			.client
+			.put_object()
+			.bucket(&bucket)
+			.key(CTRL_KEY)
+			.body(data)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.e_tag.unwrap().as_str(), etag);
+		assert!(r.version_id.is_some());
+
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			.key(CTRL_KEY)
+			.send()
+			.await
+			.unwrap();
+
+		assert_bytes_eq!(o.body, b"hi");
+		assert_eq!(o.e_tag.unwrap(), etag);
+		assert!(o.last_modified.is_some());
+		assert_eq!(o.content_length, 2);
+		assert_eq!(o.parts_count, 0);
+		assert_eq!(o.tag_count, 0);
+	}
+
+	{
+		// Key with UTF8 codepoints including emoji
+		let etag = "\"d41d8cd98f00b204e9800998ecf8427e\"";
+
+		let r = ctx
+			.client
+			.put_object()
+			.bucket(&bucket)
+			.key(UTF8_KEY)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(r.e_tag.unwrap().as_str(), etag);
+		assert!(r.version_id.is_some());
+
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			.key(UTF8_KEY)
+			.send()
+			.await
+			.unwrap();
+
+		assert_bytes_eq!(o.body, b"");
+		assert_eq!(o.e_tag.unwrap(), etag);
+		assert!(o.last_modified.is_some());
+		assert_eq!(o.content_length, 0);
+		assert_eq!(o.parts_count, 0);
+		assert_eq!(o.tag_count, 0);
+	}
+}
+
+#[tokio::test]
+async fn test_getobject() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("getobject");
+
+	let etag = "\"46cf18a9b447991b450cad3facf5937e\"";
+	let data = ByteStream::from_static(BODY);
+
+	let r = ctx
+		.client
+		.put_object()
+		.bucket(&bucket)
+		.key(STD_KEY)
+		.body(data)
+		.send()
+		.await
+		.unwrap();
+
+	assert_eq!(r.e_tag.unwrap().as_str(), etag);
+
+	{
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			.key(STD_KEY)
+			.range("bytes=1-9")
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(o.content_range.unwrap().as_str(), "bytes 1-9/62");
+		assert_bytes_eq!(o.body, &BODY[1..10]);
+	}
+	{
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			.key(STD_KEY)
+			.range("bytes=9-")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(o.content_range.unwrap().as_str(), "bytes 9-61/62");
+		assert_bytes_eq!(o.body, &BODY[9..]);
+	}
+	{
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			.key(STD_KEY)
+			.range("bytes=-5")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(o.content_range.unwrap().as_str(), "bytes 57-61/62");
+		assert_bytes_eq!(o.body, &BODY[57..]);
+	}
+}
+
+#[tokio::test]
+async fn test_deleteobject() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("deleteobject");
+
+	let mut to_del = Delete::builder();
+
+	// add content without data
+	for i in 0..5 {
+		let k = format!("k-{}", i);
+		ctx.client
+			.put_object()
+			.bucket(&bucket)
+			.key(k.to_string())
+			.send()
+			.await
+			.unwrap();
+		if i > 0 {
+			to_del = to_del.objects(ObjectIdentifier::builder().key(k).build());
+		}
+	}
+
+	// add content with data
+	for i in 0..5 {
+		let k = format!("l-{}", i);
+		let data = ByteStream::from_static(BODY);
+		ctx.client
+			.put_object()
+			.bucket(&bucket)
+			.key(k.to_string())
+			.body(data)
+			.send()
+			.await
+			.unwrap();
+
+		if i > 0 {
+			to_del = to_del.objects(ObjectIdentifier::builder().key(k).build());
+		}
+	}
+
+	ctx.client
+		.delete_object()
+		.bucket(&bucket)
+		.key("k-0")
+		.send()
+		.await
+		.unwrap();
+
+	ctx.client
+		.delete_object()
+		.bucket(&bucket)
+		.key("l-0")
+		.send()
+		.await
+		.unwrap();
+
+	let r = ctx
+		.client
+		.delete_objects()
+		.bucket(&bucket)
+		.delete(to_del.build())
+		.send()
+		.await
+		.unwrap();
+
+	assert_eq!(r.deleted.unwrap().len(), 8);
+
+	let l = ctx
+		.client
+		.list_objects_v2()
+		.bucket(&bucket)
+		.send()
+		.await
+		.unwrap();
+
+	assert!(l.contents.is_none());
+}
--- a/src/garage/tests/simple.rs
+++ b/src/garage/tests/simple.rs
@ -2,7 +2,7 @@ use crate::common;

 #[tokio::test]
 async fn test_simple() {
-	use aws_sdk_s3::ByteStream;
+	use aws_sdk_s3::types::ByteStream;

 	let ctx = common::context();
 	let bucket = ctx.create_bucket("test-simple");
--- a/src/garage/tests/streaming_signature.rs
+++ b/src/garage/tests/streaming_signature.rs
@ -0,0 +1,185 @@
+use std::collections::HashMap;
+
+use crate::common;
+use common::custom_requester::BodySignature;
+use hyper::Method;
+
+const STD_KEY: &str = "hello-world";
+//const CTRL_KEY: &str = "\x00\x01\x02\x00";
+const BODY: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+#[tokio::test]
+async fn test_putobject_streaming() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("putobject-streaming");
+
+	{
+		// Send an empty object (can serve as a directory marker)
+		// with a content type
+		let etag = "\"d41d8cd98f00b204e9800998ecf8427e\"";
+		let content_type = "text/csv";
+		let mut headers = HashMap::new();
+		headers.insert("content-type".to_owned(), content_type.to_owned());
+		let _ = ctx
+			.custom_request
+			.builder(bucket.clone())
+			.method(Method::PUT)
+			.path(STD_KEY.to_owned())
+			.unsigned_headers(headers)
+			.vhost_style(true)
+			.body(vec![])
+			.body_signature(BodySignature::Streaming(10))
+			.send()
+			.await
+			.unwrap();
+
+		// assert_eq!(r.e_tag.unwrap().as_str(), etag);
+		// We return a version ID here
+		// We should check if Amazon is returning one when versioning is not enabled
+		// assert!(r.version_id.is_some());
+
+		//let _version = r.version_id.unwrap();
+
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			.key(STD_KEY)
+			.send()
+			.await
+			.unwrap();
+
+		assert_bytes_eq!(o.body, b"");
+		assert_eq!(o.e_tag.unwrap(), etag);
+		// We do not return version ID
+		// We should check if Amazon is returning one when versioning is not enabled
+		// assert_eq!(o.version_id.unwrap(), _version);
+		assert_eq!(o.content_type.unwrap(), content_type);
+		assert!(o.last_modified.is_some());
+		assert_eq!(o.content_length, 0);
+		assert_eq!(o.parts_count, 0);
+		assert_eq!(o.tag_count, 0);
+	}
+
+	{
+		let etag = "\"46cf18a9b447991b450cad3facf5937e\"";
+
+		let _ = ctx
+			.custom_request
+			.builder(bucket.clone())
+			.method(Method::PUT)
+			//.path(CTRL_KEY.to_owned()) at the moment custom_request does not encode url so this
+			//fail
+			.path("abc".to_owned())
+			.vhost_style(true)
+			.body(BODY.to_vec())
+			.body_signature(BodySignature::Streaming(16))
+			.send()
+			.await
+			.unwrap();
+
+		// assert_eq!(r.e_tag.unwrap().as_str(), etag);
+		// assert!(r.version_id.is_some());
+
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(&bucket)
+			//.key(CTRL_KEY)
+			.key("abc")
+			.send()
+			.await
+			.unwrap();
+
+		assert_bytes_eq!(o.body, BODY);
+		assert_eq!(o.e_tag.unwrap(), etag);
+		assert!(o.last_modified.is_some());
+		assert_eq!(o.content_length, 62);
+		assert_eq!(o.parts_count, 0);
+		assert_eq!(o.tag_count, 0);
+	}
+}
+
+#[tokio::test]
+async fn test_create_bucket_streaming() {
+	let ctx = common::context();
+	let bucket = "createbucket-streaming";
+
+	{
+		// create bucket
+		let _ = ctx
+			.custom_request
+			.builder(bucket.to_owned())
+			.method(Method::PUT)
+			.body_signature(BodySignature::Streaming(10))
+			.send()
+			.await
+			.unwrap();
+
+		// test if the bucket exists and works properly
+		let etag = "\"d41d8cd98f00b204e9800998ecf8427e\"";
+		let content_type = "text/csv";
+		let _ = ctx
+			.client
+			.put_object()
+			.bucket(bucket)
+			.key(STD_KEY)
+			.content_type(content_type)
+			.send()
+			.await
+			.unwrap();
+
+		let o = ctx
+			.client
+			.get_object()
+			.bucket(bucket)
+			.key(STD_KEY)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(o.e_tag.unwrap(), etag);
+	}
+}
+
+#[tokio::test]
+async fn test_put_website_streaming() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("putwebsite-streaming");
+
+	{
+		let website_config = r#"<?xml version="1.0" encoding="UTF-8"?>
+<WebsiteConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+   <ErrorDocument>
+      <Key>err/error.html</Key>
+   </ErrorDocument>
+   <IndexDocument>
+      <Suffix>home.html</Suffix>
+   </IndexDocument>
+</WebsiteConfiguration>"#;
+
+		let mut query = HashMap::new();
+		query.insert("website".to_owned(), None);
+		let _ = ctx
+			.custom_request
+			.builder(bucket.clone())
+			.method(Method::PUT)
+			.query_params(query)
+			.body(website_config.as_bytes().to_vec())
+			.body_signature(BodySignature::Streaming(10))
+			.send()
+			.await
+			.unwrap();
+
+		let o = ctx
+			.client
+			.get_bucket_website()
+			.bucket(&bucket)
+			.send()
+			.await
+			.unwrap();
+
+		assert_eq!(o.index_document.unwrap().suffix.unwrap(), "home.html");
+		assert_eq!(o.error_document.unwrap().key.unwrap(), "err/error.html");
+	}
+}
--- a/src/garage/tests/website.rs
+++ b/src/garage/tests/website.rs
@ -0,0 +1,342 @@
+use crate::common;
+use crate::common::ext::*;
+use aws_sdk_s3::{
+	model::{CorsConfiguration, CorsRule, ErrorDocument, IndexDocument, WebsiteConfiguration},
+	types::ByteStream,
+};
+use http::Request;
+use hyper::{
+	body::{to_bytes, Body},
+	Client,
+};
+
+const BODY: &[u8; 16] = b"<h1>bonjour</h1>";
+const BODY_ERR: &[u8; 6] = b"erreur";
+
+#[tokio::test]
+async fn test_website() {
+	const BCKT_NAME: &str = "my-website";
+	let ctx = common::context();
+	let bucket = ctx.create_bucket(BCKT_NAME);
+
+	let data = ByteStream::from_static(BODY);
+
+	ctx.client
+		.put_object()
+		.bucket(&bucket)
+		.key("index.html")
+		.body(data)
+		.send()
+		.await
+		.unwrap();
+
+	let client = Client::new();
+
+	let req = || {
+		Request::builder()
+			.method("GET")
+			.uri(format!(
+				"http://127.0.0.1:{}/",
+				common::garage::DEFAULT_PORT + 2
+			))
+			.header("Host", format!("{}.web.garage", BCKT_NAME))
+			.body(Body::empty())
+			.unwrap()
+	};
+
+	let mut resp = client.request(req()).await.unwrap();
+
+	assert_eq!(resp.status(), 404);
+	assert_ne!(
+		to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+		BODY.as_ref()
+	); /* check that we do not leak body */
+
+	ctx.garage
+		.command()
+		.args(["bucket", "website", "--allow", BCKT_NAME])
+		.quiet()
+		.expect_success_status("Could not allow website on bucket");
+
+	resp = client.request(req()).await.unwrap();
+	assert_eq!(resp.status(), 200);
+	assert_eq!(
+		to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+		BODY.as_ref()
+	);
+
+	ctx.garage
+		.command()
+		.args(["bucket", "website", "--deny", BCKT_NAME])
+		.quiet()
+		.expect_success_status("Could not deny website on bucket");
+
+	resp = client.request(req()).await.unwrap();
+	assert_eq!(resp.status(), 404);
+	assert_ne!(
+		to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+		BODY.as_ref()
+	); /* check that we do not leak body */
+}
+
+#[tokio::test]
+async fn test_website_s3_api() {
+	const BCKT_NAME: &str = "my-cors";
+	let ctx = common::context();
+	let bucket = ctx.create_bucket(BCKT_NAME);
+
+	let data = ByteStream::from_static(BODY);
+
+	ctx.client
+		.put_object()
+		.bucket(&bucket)
+		.key("site/home.html")
+		.body(data)
+		.send()
+		.await
+		.unwrap();
+
+	ctx.client
+		.put_object()
+		.bucket(&bucket)
+		.key("err/error.html")
+		.body(ByteStream::from_static(BODY_ERR))
+		.send()
+		.await
+		.unwrap();
+
+	let conf = WebsiteConfiguration::builder()
+		.index_document(IndexDocument::builder().suffix("home.html").build())
+		.error_document(ErrorDocument::builder().key("err/error.html").build())
+		.build();
+
+	ctx.client
+		.put_bucket_website()
+		.bucket(&bucket)
+		.website_configuration(conf)
+		.send()
+		.await
+		.unwrap();
+
+	let cors = CorsConfiguration::builder()
+		.cors_rules(
+			CorsRule::builder()
+				.id("main-rule")
+				.allowed_headers("*")
+				.allowed_methods("GET")
+				.allowed_methods("PUT")
+				.allowed_origins("*")
+				.build(),
+		)
+		.build();
+
+	ctx.client
+		.put_bucket_cors()
+		.bucket(&bucket)
+		.cors_configuration(cors)
+		.send()
+		.await
+		.unwrap();
+
+	{
+		let cors_res = ctx
+			.client
+			.get_bucket_cors()
+			.bucket(&bucket)
+			.send()
+			.await
+			.unwrap();
+
+		let main_rule = cors_res.cors_rules().unwrap().iter().next().unwrap();
+
+		assert_eq!(main_rule.id.as_ref().unwrap(), "main-rule");
+		assert_eq!(
+			main_rule.allowed_headers.as_ref().unwrap(),
+			&vec!["*".to_string()]
+		);
+		assert_eq!(
+			main_rule.allowed_origins.as_ref().unwrap(),
+			&vec!["*".to_string()]
+		);
+		assert_eq!(
+			main_rule.allowed_methods.as_ref().unwrap(),
+			&vec!["GET".to_string(), "PUT".to_string()]
+		);
+	}
+
+	let client = Client::new();
+
+	// Test direct requests with CORS
+	{
+		let req = Request::builder()
+			.method("GET")
+			.uri(format!(
+				"http://127.0.0.1:{}/site/",
+				common::garage::DEFAULT_PORT + 2
+			))
+			.header("Host", format!("{}.web.garage", BCKT_NAME))
+			.header("Origin", "https://example.com")
+			.body(Body::empty())
+			.unwrap();
+
+		let mut resp = client.request(req).await.unwrap();
+
+		assert_eq!(resp.status(), 200);
+		assert_eq!(
+			resp.headers().get("access-control-allow-origin").unwrap(),
+			"*"
+		);
+		assert_eq!(
+			to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+			BODY.as_ref()
+		);
+	}
+
+	// Test ErrorDocument on 404
+	{
+		let req = Request::builder()
+			.method("GET")
+			.uri(format!(
+				"http://127.0.0.1:{}/wrong.html",
+				common::garage::DEFAULT_PORT + 2
+			))
+			.header("Host", format!("{}.web.garage", BCKT_NAME))
+			.body(Body::empty())
+			.unwrap();
+
+		let mut resp = client.request(req).await.unwrap();
+
+		assert_eq!(resp.status(), 404);
+		assert_eq!(
+			to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+			BODY_ERR.as_ref()
+		);
+	}
+
+	// Test CORS with an allowed preflight request
+	{
+		let req = Request::builder()
+			.method("OPTIONS")
+			.uri(format!(
+				"http://127.0.0.1:{}/site/",
+				common::garage::DEFAULT_PORT + 2
+			))
+			.header("Host", format!("{}.web.garage", BCKT_NAME))
+			.header("Origin", "https://example.com")
+			.header("Access-Control-Request-Method", "PUT")
+			.body(Body::empty())
+			.unwrap();
+
+		let mut resp = client.request(req).await.unwrap();
+
+		assert_eq!(resp.status(), 200);
+		assert_eq!(
+			resp.headers().get("access-control-allow-origin").unwrap(),
+			"*"
+		);
+		assert_ne!(
+			to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+			BODY.as_ref()
+		);
+	}
+
+	// Test CORS with a forbidden preflight request
+	{
+		let req = Request::builder()
+			.method("OPTIONS")
+			.uri(format!(
+				"http://127.0.0.1:{}/site/",
+				common::garage::DEFAULT_PORT + 2
+			))
+			.header("Host", format!("{}.web.garage", BCKT_NAME))
+			.header("Origin", "https://example.com")
+			.header("Access-Control-Request-Method", "DELETE")
+			.body(Body::empty())
+			.unwrap();
+
+		let mut resp = client.request(req).await.unwrap();
+
+		assert_eq!(resp.status(), 403);
+		assert_ne!(
+			to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+			BODY.as_ref()
+		);
+	}
+
+	//@TODO test CORS on the S3 endpoint. We need to handle auth manually to check it.
+
+	// Delete cors
+	ctx.client
+		.delete_bucket_cors()
+		.bucket(&bucket)
+		.send()
+		.await
+		.unwrap();
+
+	// Check CORS are deleted from the API
+	// @FIXME check what is the expected behavior when GetBucketCors is called on a bucket without
+	// any CORS.
+	assert!(ctx
+		.client
+		.get_bucket_cors()
+		.bucket(&bucket)
+		.send()
+		.await
+		.is_err());
+
+	// Test CORS are not sent anymore on a previously allowed request
+	{
+		let req = Request::builder()
+			.method("OPTIONS")
+			.uri(format!(
+				"http://127.0.0.1:{}/site/",
+				common::garage::DEFAULT_PORT + 2
+			))
+			.header("Host", format!("{}.web.garage", BCKT_NAME))
+			.header("Origin", "https://example.com")
+			.header("Access-Control-Request-Method", "PUT")
+			.body(Body::empty())
+			.unwrap();
+
+		let mut resp = client.request(req).await.unwrap();
+
+		assert_eq!(resp.status(), 403);
+		assert_ne!(
+			to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+			BODY.as_ref()
+		);
+	}
+
+	// Disallow website from the API
+	ctx.client
+		.delete_bucket_website()
+		.bucket(&bucket)
+		.send()
+		.await
+		.unwrap();
+
+	// Check that the website is not served anymore
+	{
+		let req = Request::builder()
+			.method("GET")
+			.uri(format!(
+				"http://127.0.0.1:{}/site/",
+				common::garage::DEFAULT_PORT + 2
+			))
+			.header("Host", format!("{}.web.garage", BCKT_NAME))
+			.body(Body::empty())
+			.unwrap();
+
+		let mut resp = client.request(req).await.unwrap();
+
+		assert_eq!(resp.status(), 404);
+		assert_ne!(
+			to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+			BODY_ERR.as_ref()
+		);
+		assert_ne!(
+			to_bytes(resp.body_mut()).await.unwrap().as_ref(),
+			BODY.as_ref()
+		);
+	}
+}
--- a/src/model/Cargo.toml
+++ b/src/model/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "garage_model"
-version = "0.6.0"
+version = "0.7.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@ -14,16 +14,17 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-garage_rpc = { version = "0.6.0", path = "../rpc" }
-garage_table = { version = "0.6.0", path = "../table" }
-garage_util = { version = "0.6.0", path = "../util" }
+garage_rpc = { version = "0.7.0", path = "../rpc" }
+garage_table = { version = "0.7.0", path = "../table" }
+garage_block = { version = "0.7.0", path = "../block" }
+garage_util = { version = "0.7.0", path = "../util" }
 garage_model_050 = { package = "garage_model", version = "0.5.1" }

 async-trait = "0.1.7"
 arc-swap = "1.0"
 err-derive = "0.3"
 hex = "0.4"
-log = "0.4"
+tracing = "0.1.30"
 rand = "0.8"
 zstd = { version = "0.9", default-features = false }

@ -36,6 +37,8 @@ serde_bytes = "0.11"
 futures = "0.3"
 futures-util = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
+opentelemetry = "0.17"

 #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-netapp = "0.3.0"
+#netapp = { version = "0.4", path = "../../../netapp" }
+netapp = "0.4"
--- a/src/model/block_ref_table.rs
+++ b/src/model/block_ref_table.rs
@ -6,7 +6,7 @@ use garage_util::data::*;
 use garage_table::crdt::Crdt;
 use garage_table::*;

-use crate::block::*;
+use garage_block::manager::*;

 #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
 pub struct BlockRef {
@ -52,7 +52,8 @@ impl TableSchema for BlockRefTable {
 	type Filter = DeletedFilter;

 	fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
-		let block = &old.as_ref().or_else(|| new.as_ref()).unwrap().block;
+		#[allow(clippy::or_fun_call)]
+		let block = &old.as_ref().or(new.as_ref()).unwrap().block;
 		let was_before = old.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
 		let is_after = new.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
 		if is_after && !was_before {
--- a/src/model/garage.rs
+++ b/src/model/garage.rs
@ -7,12 +7,12 @@ use garage_util::config::*;

 use garage_rpc::system::System;

+use garage_block::manager::*;
 use garage_table::replication::ReplicationMode;
 use garage_table::replication::TableFullReplication;
 use garage_table::replication::TableShardedReplication;
 use garage_table::*;

-use crate::block::*;
 use crate::block_ref_table::*;
 use crate::bucket_alias_table::*;
 use crate::bucket_table::*;
@ -86,8 +86,14 @@ impl Garage {
 		};

 		info!("Initialize block manager...");
-		let block_manager =
-			BlockManager::new(&db, config.data_dir.clone(), data_rep_param, system.clone());
+		let block_manager = BlockManager::new(
+			&db,
+			config.data_dir.clone(),
+			config.compression_level,
+			config.block_manager_background_tranquility,
+			data_rep_param,
+			system.clone(),
+		);

 		info!("Initialize block_ref_table...");
 		let block_ref_table = Table::new(
@ -136,7 +142,8 @@ impl Garage {
 		let key_table = Table::new(KeyTable, control_rep_param, system.clone(), &db);

 		info!("Initialize Garage...");
-		let garage = Arc::new(Self {
+
+		Arc::new(Self {
 			config,
 			db,
 			background,
@ -148,18 +155,7 @@ impl Garage {
 			object_table,
 			version_table,
 			block_ref_table,
-		});
-
-		info!("Start block manager background thread...");
-		garage.block_manager.garage.swap(Some(garage.clone()));
-		garage.block_manager.clone().spawn_background_worker();
-
-		garage
-	}
-
-	/// Use this for shutdown
-	pub fn break_reference_cycles(&self) {
-		self.block_manager.garage.swap(None);
+		})
 	}

 	pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
--- a/src/model/helper/bucket.rs
+++ b/src/model/helper/bucket.rs
@ -30,8 +30,7 @@ impl<'a> BucketHelper<'a> {
 		// the AWS spec, and hex-encoded UUIDs are 64 chars long.
 		let hexbucket = hex::decode(bucket_name.as_str())
 			.ok()
-			.map(|by| Uuid::try_from(&by))
-			.flatten();
+			.and_then(|by| Uuid::try_from(&by));
 		if let Some(bucket_id) = hexbucket {
 			Ok(self
 				.0
@ -46,8 +45,7 @@ impl<'a> BucketHelper<'a> {
 				.bucket_alias_table
 				.get(&EmptyKey, bucket_name)
 				.await?
-				.map(|x| *x.state.get())
-				.flatten())
+				.and_then(|x| *x.state.get()))
 		}
 	}

--- a/src/model/key_table.rs
+++ b/src/model/key_table.rs
@ -106,8 +106,7 @@ impl Key {
 	/// Get permissions for a bucket
 	pub fn bucket_permissions(&self, bucket: &Uuid) -> BucketKeyPerm {
 		self.params()
-			.map(|params| params.authorized_buckets.get(bucket))
-			.flatten()
+			.and_then(|params| params.authorized_buckets.get(bucket))
 			.cloned()
 			.unwrap_or(BucketKeyPerm::NO_PERMISSIONS)
 	}
--- a/src/model/lib.rs
+++ b/src/model/lib.rs
@ -1,5 +1,5 @@
 #[macro_use]
-extern crate log;
+extern crate tracing;

 pub mod permission;

@ -10,8 +10,6 @@ pub mod key_table;
 pub mod object_table;
 pub mod version_table;

-pub mod block;
-
 pub mod garage;
 pub mod helper;
 pub mod migrate;
--- a/src/rpc/Cargo.toml
+++ b/src/rpc/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "garage_rpc"
-version = "0.6.0"
+version = "0.7.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@ -14,13 +14,14 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-garage_util = { version = "0.6.0", path = "../util" }
+garage_util = { version = "0.7.0", path = "../util" }
+garage_admin = { version = "0.7.0", path = "../admin" }

 arc-swap = "1.0"
 bytes = "1.0"
 gethostname = "0.2"
 hex = "0.4"
-log = "0.4"
+tracing = "0.1.30"
 rand = "0.8"
 sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" }

@ -30,12 +31,26 @@ serde = { version = "1.0", default-features = false, features = ["derive", "rc"]
 serde_bytes = "0.11"
 serde_json = "1.0"

+# newer version requires rust edition 2021
+kube = { version = "0.62", features = ["runtime", "derive"], optional = true }
+k8s-openapi = { version = "0.13", features = ["v1_22"], optional = true }
+openssl = { version = "0.10", features = ["vendored"], optional = true }
+schemars = { version = "0.8", optional = true }
+
+# newer version requires rust edition 2021
+pnet_datalink = "0.28"
+
 futures = "0.3"
 futures-util = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
 tokio-stream = { version = "0.1", features = ["net"] }
+opentelemetry = "0.17"

 #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-netapp = "0.3.0"
+#netapp = { version = "0.4", path = "../../../netapp", features = ["telemetry"] }
+netapp = { version = "0.4.2", features = ["telemetry"] }
+
 hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] }

+[features]
+kubernetes-discovery = [ "kube", "k8s-openapi", "openssl", "schemars" ]
--- a/src/rpc/consul.rs
+++ b/src/rpc/consul.rs
@ -51,10 +51,8 @@ pub async fn get_consul_nodes(
 		let pubkey = ent
 			.node_meta
 			.get("pubkey")
-			.map(|k| hex::decode(&k).ok())
-			.flatten()
-			.map(|k| NodeID::from_slice(&k[..]))
-			.flatten();
+			.and_then(|k| hex::decode(&k).ok())
+			.and_then(|k| NodeID::from_slice(&k[..]));
 		if let (Some(ip), Some(pubkey)) = (ip, pubkey) {
 			ret.push((pubkey, SocketAddr::new(ip, ent.service_port)));
 		} else {
@ -139,10 +137,10 @@ pub async fn publish_consul_service(
 	let resp = client.request(req).await?;
 	debug!("Response of advertising to Consul: {:?}", resp);
 	let resp_code = resp.status();
+	let resp_bytes = &hyper::body::to_bytes(resp.into_body()).await?;
 	debug!(
 		"{}",
-		std::str::from_utf8(&hyper::body::to_bytes(resp.into_body()).await?)
-			.unwrap_or("<invalid utf8>")
+		std::str::from_utf8(resp_bytes).unwrap_or("<invalid utf8>")
 	);

 	if resp_code != StatusCode::OK {
--- a/src/rpc/kubernetes.rs
+++ b/src/rpc/kubernetes.rs
@ -0,0 +1,114 @@
+use std::collections::BTreeMap;
+use std::net::{IpAddr, SocketAddr};
+
+use kube::{
+	api::{ListParams, Patch, PatchParams, PostParams},
+	Api, Client, CustomResource, CustomResourceExt,
+};
+
+use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+use netapp::NodeID;
+
+static K8S_GROUP: &str = "deuxfleurs.fr";
+
+#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
+#[kube(
+	group = "deuxfleurs.fr",
+	version = "v1",
+	kind = "GarageNode",
+	namespaced
+)]
+pub struct Node {
+	hostname: String,
+	address: IpAddr,
+	port: u16,
+}
+
+pub async fn create_kubernetes_crd() -> Result<(), kube::Error> {
+	let client = Client::try_default().await?;
+	let crds: Api<CustomResourceDefinition> = Api::all(client.clone());
+
+	let params = PatchParams::apply(&format!("garage.{}", K8S_GROUP));
+	let crd = GarageNode::crd();
+	let patch = Patch::Apply(crd);
+	crds.patch(&format!("garagenodes.{}", K8S_GROUP), &params, &patch)
+		.await?;
+
+	Ok(())
+}
+
+pub async fn get_kubernetes_nodes(
+	kubernetes_service_name: &str,
+	kubernetes_namespace: &str,
+) -> Result<Vec<(NodeID, SocketAddr)>, kube::Error> {
+	let client = Client::try_default().await?;
+	let nodes: Api<GarageNode> = Api::namespaced(client.clone(), kubernetes_namespace);
+
+	let lp = ListParams::default().labels(&format!(
+		"garage.{}/service={}",
+		K8S_GROUP, kubernetes_service_name
+	));
+
+	let nodes = nodes.list(&lp).await?;
+	let mut ret = Vec::with_capacity(nodes.items.len());
+
+	for node in nodes {
+		println!("Found Pod: {:?}", node.metadata.name);
+
+		let pubkey = &node
+			.metadata
+			.name
+			.and_then(|k| hex::decode(&k).ok())
+			.and_then(|k| NodeID::from_slice(&k[..]));
+
+		if let Some(pubkey) = pubkey {
+			ret.push((*pubkey, SocketAddr::new(node.spec.address, node.spec.port)))
+		}
+	}
+
+	Ok(ret)
+}
+
+pub async fn publish_kubernetes_node(
+	kubernetes_service_name: &str,
+	kubernetes_namespace: &str,
+	node_id: NodeID,
+	hostname: &str,
+	rpc_public_addr: SocketAddr,
+) -> Result<(), kube::Error> {
+	let node_pubkey = hex::encode(node_id);
+
+	let mut node = GarageNode::new(
+		&node_pubkey,
+		Node {
+			hostname: hostname.to_string(),
+			address: rpc_public_addr.ip(),
+			port: rpc_public_addr.port(),
+		},
+	);
+
+	let labels = node.metadata.labels.insert(BTreeMap::new());
+	labels.insert(
+		format!("garage.{}/service", K8S_GROUP),
+		kubernetes_service_name.to_string(),
+	);
+
+	debug!("Node object to be applied: {:#?}", node);
+
+	let client = Client::try_default().await?;
+	let nodes: Api<GarageNode> = Api::namespaced(client.clone(), kubernetes_namespace);
+
+	if let Ok(old_node) = nodes.get(&node_pubkey).await {
+		node.metadata.resource_version = old_node.metadata.resource_version;
+		nodes
+			.replace(&node_pubkey, &PostParams::default(), &node)
+			.await?;
+	} else {
+		nodes.create(&PostParams::default(), &node).await?;
+	};
+
+	Ok(())
+}
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@ -172,30 +172,43 @@ impl ClusterLayout {
 		println!("Calculating updated partition assignation, this may take some time...");
 		println!();

+		// Get old partition assignation
 		let old_partitions = self.parse_assignation_data();

-		let mut partitions = old_partitions.clone();
-		for part in partitions.iter_mut() {
-			part.nodes
-				.retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
+		// Start new partition assignation with nodes from old assignation where it is relevant
+		let mut partitions = old_partitions
+			.iter()
+			.map(|old_part| {
+				let mut new_part = PartitionAss::new();
+				for node in old_part.nodes.iter() {
+					if let Some(role) = node.1 {
+						if role.capacity.is_some() {
+							new_part.add(None, n_zones, node.0, role);
 						}
+					}
+				}
+				new_part
+			})
+			.collect::<Vec<_>>();

-		// When nodes are removed, or when bootstraping an assignation from
-		// scratch for a new cluster, the old partitions will have holes (or be empty).
-		// Here we add more nodes to make a complete (sub-optimal) assignation,
+		// In various cases, not enough nodes will have been added for all partitions
+		// in the step above (e.g. due to node removals, or new zones being added).
+		// Here we add more nodes to make a complete (but sub-optimal) assignation,
 		// using an initial partition assignation that is calculated using the multi-dc maglev trick
 		match self.initial_partition_assignation() {
 			Some(initial_partitions) => {
 				for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) {
 					for (id, info) in ipart.nodes.iter() {
 						if part.nodes.len() < self.replication_factor {
-							part.add(part.nodes.len() + 1, n_zones, id, info.unwrap());
+							part.add(None, n_zones, id, info.unwrap());
 						}
 					}
 					assert!(part.nodes.len() == self.replication_factor);
 				}
 			}
 			None => {
+				// Not enough nodes in cluster to build a correct assignation.
+				// Signal it by returning an error.
 				return false;
 			}
 		}
@ -232,8 +245,13 @@ impl ClusterLayout {
 			let mut option = None;
 			for (i, part) in partitions.iter_mut().enumerate() {
 				for (irm, (idrm, _)) in part.nodes.iter().enumerate() {
-					let suprm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as i32
-						- target_partitions_per_node.get(*idrm).cloned().unwrap_or(0) as i32;
+					let errratio = |node, parts| {
+						let tgt = *target_partitions_per_node.get(node).unwrap() as f32;
+						(parts - tgt) / tgt
+					};
+					let square = |x| x * x;
+
+					let partsrm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as f32;

 					for (idadd, infoadd) in configured_nodes.iter() {
 						// skip replacing a node by itself
@ -242,14 +260,12 @@ impl ClusterLayout {
 							continue;
 						}

-						let supadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as i32
-							- target_partitions_per_node.get(*idadd).cloned().unwrap_or(0) as i32;
-
 						// We want to try replacing node idrm by node idadd
 						// if that brings us close to our goal.
-						let square = |i: i32| i * i;
-						let oldcost = square(suprm) + square(supadd);
-						let newcost = square(suprm - 1) + square(supadd + 1);
+						let partsadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as f32;
+						let oldcost = square(errratio(*idrm, partsrm) - errratio(*idadd, partsadd));
+						let newcost =
+							square(errratio(*idrm, partsrm - 1.) - errratio(*idadd, partsadd + 1.));
 						if newcost >= oldcost {
 							// not closer to our goal
 							continue;
@ -259,7 +275,7 @@ impl ClusterLayout {
 						let mut newpart = part.clone();

 						newpart.nodes.remove(irm);
-						if !newpart.add(newpart.nodes.len() + 1, n_zones, idadd, infoadd) {
+						if !newpart.add(None, n_zones, idadd, infoadd) {
 							continue;
 						}
 						assert!(newpart.nodes.len() == self.replication_factor);
@ -302,7 +318,9 @@ impl ClusterLayout {
 		// Show statistics
 		println!("New number of partitions per node:");
 		for (node, npart) in partitions_per_node.iter() {
-			println!("{:?}\t{}", node, npart);
+			let tgt = *target_partitions_per_node.get(node).unwrap();
+			let pct = 100f32 * (*npart as f32) / (tgt as f32);
+			println!("{:?}\t{}\t({}% of {})", node, npart, pct as i32, tgt);
 		}
 		println!();

@ -394,7 +412,7 @@ impl ClusterLayout {
 							continue;
 						}
 						for (pos2, &qv) in q.iter().enumerate().skip(*pos) {
-							if partitions[qv].add(rep + 1, n_zones, node_id, node_info) {
+							if partitions[qv].add(Some(rep + 1), n_zones, node_id, node_info) {
 								remaining -= 1;
 								*pos = pos2 + 1;
 								break;
@ -551,16 +569,27 @@ impl<'a> PartitionAss<'a> {
 		}
 	}

+	// add is a key function in creating a PartitionAss, i.e. the list of nodes
+	// to which a partition is assigned. It tries to add a certain node id to the
+	// assignation, but checks that doing so is compatible with the NECESSARY
+	// condition that the partition assignation must be dispersed over different
+	// zones (datacenters) if enough zones exist. This is why it takes a n_zones
+	// parameter, which is the total number of zones that have existing nodes:
+	// if nodes in the assignation already cover all n_zones zones, then any node
+	// that is not yet in the assignation can be added. Otherwise, only nodes
+	// that are in a new zone can be added.
 	fn add(
 		&mut self,
-		target_len: usize,
+		target_len: Option<usize>,
 		n_zones: usize,
 		node: &'a Uuid,
 		role: &'a NodeRole,
 	) -> bool {
-		if self.nodes.len() != target_len - 1 {
+		if let Some(tl) = target_len {
+			if self.nodes.len() != tl - 1 {
 				return false;
 			}
+		}

 		let p_zns = self
 			.nodes
--- a/src/rpc/lib.rs
+++ b/src/rpc/lib.rs
@ -1,14 +1,17 @@
 //! Crate containing rpc related functions and types used in Garage

 #[macro_use]
-extern crate log;
+extern crate tracing;

 mod consul;
+#[cfg(feature = "kubernetes-discovery")]
+mod kubernetes;

 pub mod layout;
 pub mod ring;
 pub mod system;

+mod metrics;
 pub mod rpc_helper;

 pub use rpc_helper::*;
--- a/src/rpc/metrics.rs
+++ b/src/rpc/metrics.rs
@ -0,0 +1,55 @@
+use std::sync::Arc;
+
+use opentelemetry::{global, metrics::*};
+use tokio::sync::Semaphore;
+
+/// TableMetrics reference all counter used for metrics
+pub struct RpcMetrics {
+	pub(crate) _rpc_available_permits: ValueObserver<u64>,
+
+	pub(crate) rpc_counter: Counter<u64>,
+	pub(crate) rpc_timeout_counter: Counter<u64>,
+	pub(crate) rpc_netapp_error_counter: Counter<u64>,
+	pub(crate) rpc_garage_error_counter: Counter<u64>,
+
+	pub(crate) rpc_duration: ValueRecorder<f64>,
+	pub(crate) rpc_queueing_time: ValueRecorder<f64>,
+}
+impl RpcMetrics {
+	pub fn new(sem: Arc<Semaphore>) -> Self {
+		let meter = global::meter("garage_rpc");
+		RpcMetrics {
+			_rpc_available_permits: meter
+				.u64_value_observer("rpc.available_permits", move |observer| {
+					observer.observe(sem.available_permits() as u64, &[])
+				})
+				.with_description("Number of available RPC permits")
+				.init(),
+
+			rpc_counter: meter
+				.u64_counter("rpc.request_counter")
+				.with_description("Number of RPC requests emitted")
+				.init(),
+			rpc_timeout_counter: meter
+				.u64_counter("rpc.timeout_counter")
+				.with_description("Number of RPC timeouts")
+				.init(),
+			rpc_netapp_error_counter: meter
+				.u64_counter("rpc.netapp_error_counter")
+				.with_description("Number of communication errors (errors in the Netapp library)")
+				.init(),
+			rpc_garage_error_counter: meter
+				.u64_counter("rpc.garage_error_counter")
+				.with_description("Number of RPC errors (errors happening when handling the RPC)")
+				.init(),
+			rpc_duration: meter
+				.f64_value_recorder("rpc.duration")
+				.with_description("Duration of RPCs")
+				.init(),
+			rpc_queueing_time: meter
+				.f64_value_recorder("rpc.queueing_time")
+				.with_description("Time RPC requests were queued for before being sent")
+				.init(),
+		}
+	}
+}
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@ -9,6 +9,12 @@ use futures_util::future::FutureExt;
 use tokio::select;
 use tokio::sync::{watch, Semaphore};

+use opentelemetry::KeyValue;
+use opentelemetry::{
+	trace::{FutureExt as OtelFutureExt, Span, TraceContextExt, Tracer},
+	Context,
+};
+
 pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc};
 use netapp::peering::fullmesh::FullMeshPeeringStrategy;
 pub use netapp::proto::*;
@ -17,7 +23,9 @@ pub use netapp::{NetApp, NodeID};
 use garage_util::background::BackgroundRunner;
 use garage_util::data::*;
 use garage_util::error::Error;
+use garage_util::metrics::RecordDuration;

+use crate::metrics::RpcMetrics;
 use crate::ring::Ring;

 const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
@ -76,7 +84,8 @@ struct RpcHelperInner {
 	fullmesh: Arc<FullMeshPeeringStrategy>,
 	background: Arc<BackgroundRunner>,
 	ring: watch::Receiver<Arc<Ring>>,
-	request_buffer_semaphore: Semaphore,
+	request_buffer_semaphore: Arc<Semaphore>,
+	metrics: RpcMetrics,
 }

 impl RpcHelper {
@ -86,12 +95,17 @@ impl RpcHelper {
 		background: Arc<BackgroundRunner>,
 		ring: watch::Receiver<Arc<Ring>>,
 	) -> Self {
+		let sem = Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE));
+
+		let metrics = RpcMetrics::new(sem.clone());
+
 		Self(Arc::new(RpcHelperInner {
 			our_node_id,
 			fullmesh,
 			background,
 			ring,
-			request_buffer_semaphore: Semaphore::new(REQUEST_BUFFER_SIZE),
+			request_buffer_semaphore: sem,
+			metrics,
 		}))
 	}

@ -120,21 +134,45 @@ impl RpcHelper {
 		M: Rpc<Response = Result<S, Error>>,
 		H: EndpointHandler<M>,
 	{
+		let metric_tags = [
+			KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
+			KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
+			KeyValue::new("to", format!("{:?}", to)),
+		];
+
 		let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32;
 		let permit = self
 			.0
 			.request_buffer_semaphore
 			.acquire_many(msg_size)
+			.record_duration(&self.0.metrics.rpc_queueing_time, &metric_tags)
 			.await?;

+		self.0.metrics.rpc_counter.add(1, &metric_tags);
+
 		let node_id = to.into();
+		let rpc_call = endpoint
+			.call(&node_id, msg, strat.rs_priority)
+			.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
+
 		select! {
-			res = endpoint.call(&node_id, &msg, strat.rs_priority) => {
+			res = rpc_call => {
 				drop(permit);
-				Ok(res??)
+
+				if res.is_err() {
+					self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags);
+				}
+				let res = res?;
+
+				if res.is_err() {
+					self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags);
+				}
+
+				Ok(res?)
 			}
 			_ = tokio::time::sleep(strat.rs_timeout) => {
 				drop(permit);
+				self.0.metrics.rpc_timeout_counter.add(1, &metric_tags);
 				Err(Error::Timeout)
 			}
 		}
@ -195,7 +233,47 @@ impl RpcHelper {
 	where
 		M: Rpc<Response = Result<S, Error>> + 'static,
 		H: EndpointHandler<M> + 'static,
-		S: Send,
+		S: Send + 'static,
+	{
+		let quorum = strategy.rs_quorum.unwrap_or(to.len());
+
+		let tracer = opentelemetry::global::tracer("garage");
+		let span_name = if strategy.rs_interrupt_after_quorum {
+			format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len())
+		} else {
+			format!(
+				"RPC {} to {} (quorum {})",
+				endpoint.path(),
+				to.len(),
+				quorum
+			)
+		};
+		let mut span = tracer.start(span_name);
+		span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
+		span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
+		span.set_attribute(KeyValue::new("quorum", quorum as i64));
+		span.set_attribute(KeyValue::new(
+			"interrupt_after_quorum",
+			strategy.rs_interrupt_after_quorum.to_string(),
+		));
+
+		self.try_call_many_internal(endpoint, to, msg, strategy, quorum)
+			.with_context(Context::current_with_span(span))
+			.await
+	}
+
+	async fn try_call_many_internal<M, H, S>(
+		&self,
+		endpoint: &Arc<Endpoint<M, H>>,
+		to: &[Uuid],
+		msg: M,
+		strategy: RequestStrategy,
+		quorum: usize,
+	) -> Result<Vec<S>, Error>
+	where
+		M: Rpc<Response = Result<S, Error>> + 'static,
+		H: EndpointHandler<M> + 'static,
+		S: Send + 'static,
 	{
 		let msg = Arc::new(msg);

@ -210,7 +288,6 @@ impl RpcHelper {
 				self2.call_arc(&endpoint2, to, msg, strategy).await
 			})
 		});
-		let quorum = strategy.rs_quorum.unwrap_or(to.len());

 		// Vectors in which success results and errors will be collected
 		let mut successes = vec![];
@ -245,8 +322,7 @@ impl RpcHelper {
 					let peer_avg_ping = peer_list
 						.iter()
 						.find(|x| x.id.as_ref() == to.as_slice())
-						.map(|pi| pi.avg_ping)
-						.flatten()
+						.and_then(|pi| pi.avg_ping)
 						.unwrap_or_else(|| Duration::from_secs(1));
 					(
 						to != self.0.our_node_id,
@ -274,8 +350,12 @@ impl RpcHelper {
 				// If the current set of requests that are running is not enough to possibly
 				// reach quorum, start some new requests.
 				while successes.len() + resp_stream.len() < quorum {
-					if let Some((_, _, _, _to, fut)) = requests.next() {
-						resp_stream.push(fut);
+					if let Some((_, _, _, req_to, fut)) = requests.next() {
+						let tracer = opentelemetry::global::tracer("garage");
+						let span = tracer.start(format!("RPC to {:?}", req_to));
+						resp_stream.push(tokio::spawn(
+							fut.with_context(Context::current_with_span(span)),
+						));
 					} else {
 						// If we have no request to add, we know that we won't ever
 						// reach quorum: bail out now.
@ -285,7 +365,7 @@ impl RpcHelper {
 				assert!(!resp_stream.is_empty()); // because of loop invariants

 				// Wait for one request to terminate
-				match resp_stream.next().await.unwrap() {
+				match resp_stream.next().await.unwrap().unwrap() {
 					Ok(msg) => {
 						successes.push(msg);
 					}
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@ -1,7 +1,7 @@
 //! Module containing structs related to membership management
 use std::collections::HashMap;
 use std::io::{Read, Write};
-use std::net::SocketAddr;
+use std::net::{IpAddr, SocketAddr};
 use std::path::Path;
 use std::sync::{Arc, RwLock};
 use std::time::{Duration, Instant};
@ -29,6 +29,8 @@ use garage_util::persister::Persister;
 use garage_util::time::*;

 use crate::consul::*;
+#[cfg(feature = "kubernetes-discovery")]
+use crate::kubernetes::*;
 use crate::layout::*;
 use crate::ring::*;
 use crate::rpc_helper::*;
@ -37,6 +39,9 @@ const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
 const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
 const PING_TIMEOUT: Duration = Duration::from_secs(2);

+/// Version tag used for version check upon Netapp connection
+pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007
+
 /// RPC endpoint used for calls related to membership
 pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc";

@ -86,8 +91,11 @@ pub struct System {
 	rpc_listen_addr: SocketAddr,
 	rpc_public_addr: Option<SocketAddr>,
 	bootstrap_peers: Vec<(NodeID, SocketAddr)>,
-	consul_host: Option<String>,
-	consul_service_name: Option<String>,
+
+	consul_discovery: Option<ConsulDiscoveryParam>,
+	#[cfg(feature = "kubernetes-discovery")]
+	kubernetes_discovery: Option<KubernetesDiscoveryParam>,
+
 	replication_factor: usize,

 	/// The ring
@ -188,7 +196,10 @@ impl System {
 	) -> Arc<Self> {
 		let node_key =
 			gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID");
-		info!("Node public key: {}", hex::encode(&node_key.public_key()));
+		info!(
+			"Node ID of this node: {}",
+			hex::encode(&node_key.public_key()[..8])
+		);

 		let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout");
 		let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");
@ -216,21 +227,53 @@ impl System {
 		let ring = Ring::new(cluster_layout, replication_factor);
 		let (update_ring, ring) = watch::channel(Arc::new(ring));

-		if let Some(addr) = config.rpc_public_addr {
-			println!("{}@{}", hex::encode(&node_key.public_key()), addr);
-		} else {
-			println!("{}", hex::encode(&node_key.public_key()));
+		let rpc_public_addr = match config.rpc_public_addr {
+			Some(a) => Some(a),
+			None => {
+				let addr =
+					get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port()));
+				if let Some(a) = addr {
+					warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a);
 				}
+				addr
+			}
+		};

-		let netapp = NetApp::new(network_key, node_key);
+		let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key);
 		let fullmesh = FullMeshPeeringStrategy::new(
 			netapp.clone(),
 			config.bootstrap_peers.clone(),
-			config.rpc_public_addr,
+			rpc_public_addr,
 		);

 		let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into());

+		let consul_discovery = match (&config.consul_host, &config.consul_service_name) {
+			(Some(ch), Some(csn)) => Some(ConsulDiscoveryParam {
+				consul_host: ch.to_string(),
+				service_name: csn.to_string(),
+			}),
+			_ => None,
+		};
+
+		#[cfg(feature = "kubernetes-discovery")]
+		let kubernetes_discovery = match (
+			&config.kubernetes_service_name,
+			&config.kubernetes_namespace,
+		) {
+			(Some(ksn), Some(kn)) => Some(KubernetesDiscoveryParam {
+				service_name: ksn.to_string(),
+				namespace: kn.to_string(),
+				skip_crd: config.kubernetes_skip_crd,
+			}),
+			_ => None,
+		};
+
+		#[cfg(not(feature = "kubernetes-discovery"))]
+		if config.kubernetes_service_name.is_some() || config.kubernetes_namespace.is_some() {
+			warn!("Kubernetes discovery is not enabled in this build.");
+		}
+
 		let sys = Arc::new(System {
 			id: netapp.id.into(),
 			persist_cluster_layout,
@ -243,10 +286,12 @@ impl System {
 			system_endpoint,
 			replication_factor,
 			rpc_listen_addr: config.rpc_bind_addr,
-			rpc_public_addr: config.rpc_public_addr,
+			rpc_public_addr,
 			bootstrap_peers: config.bootstrap_peers.clone(),
-			consul_host: config.consul_host.clone(),
-			consul_service_name: config.consul_service_name.clone(),
+			consul_discovery,
+			#[cfg(feature = "kubernetes-discovery")]
+			kubernetes_discovery,
+
 			ring,
 			update_ring: Mutex::new(update_ring),
 			background,
@ -270,23 +315,22 @@ impl System {
 	// ---- INTERNALS ----

 	async fn advertise_to_consul(self: Arc<Self>) -> Result<(), Error> {
-		let (consul_host, consul_service_name) =
-			match (&self.consul_host, &self.consul_service_name) {
-				(Some(ch), Some(csn)) => (ch, csn),
+		let c = match &self.consul_discovery {
+			Some(c) => c,
 			_ => return Ok(()),
 		};

 		let rpc_public_addr = match self.rpc_public_addr {
 			Some(addr) => addr,
 			None => {
-				warn!("Not advertising to Consul because rpc_public_addr is not defined in config file.");
+				warn!("Not advertising to Consul because rpc_public_addr is not defined in config file and could not be autodetected.");
 				return Ok(());
 			}
 		};

 		publish_consul_service(
-			consul_host,
-			consul_service_name,
+			&c.consul_host,
+			&c.service_name,
 			self.netapp.id,
 			&self.local_status.load_full().hostname,
 			rpc_public_addr,
@ -295,6 +339,32 @@ impl System {
 		.err_context("Error while publishing Consul service")
 	}

+	#[cfg(feature = "kubernetes-discovery")]
+	async fn advertise_to_kubernetes(self: Arc<Self>) -> Result<(), Error> {
+		let k = match &self.kubernetes_discovery {
+			Some(k) => k,
+			_ => return Ok(()),
+		};
+
+		let rpc_public_addr = match self.rpc_public_addr {
+			Some(addr) => addr,
+			None => {
+				warn!("Not advertising to Kubernetes because rpc_public_addr is not defined in config file and could not be autodetected.");
+				return Ok(());
+			}
+		};
+
+		publish_kubernetes_node(
+			&k.service_name,
+			&k.namespace,
+			self.netapp.id,
+			&self.local_status.load_full().hostname,
+			rpc_public_addr,
+		)
+		.await
+		.err_context("Error while publishing node to kubernetes")
+	}
+
 	/// Save network configuration to disc
 	async fn save_cluster_layout(self: Arc<Self>) -> Result<(), Error> {
 		let ring: Arc<Ring> = self.ring.borrow().clone();
@ -465,11 +535,6 @@ impl System {
 	}

 	async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
-		let consul_config = match (&self.consul_host, &self.consul_service_name) {
-			(Some(ch), Some(csn)) => Some((ch.clone(), csn.clone())),
-			_ => None,
-		};
-
 		while !*stop_signal.borrow() {
 			let not_configured = !self.ring.borrow().layout.check();
 			let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
@ -492,8 +557,8 @@ impl System {
 				}

 				// Fetch peer list from Consul
-				if let Some((consul_host, consul_service_name)) = &consul_config {
-					match get_consul_nodes(consul_host, consul_service_name).await {
+				if let Some(c) = &self.consul_discovery {
+					match get_consul_nodes(&c.consul_host, &c.service_name).await {
 						Ok(node_list) => {
 							ping_list.extend(node_list);
 						}
@ -503,6 +568,28 @@ impl System {
 					}
 				}

+				// Fetch peer list from Kubernetes
+				#[cfg(feature = "kubernetes-discovery")]
+				if let Some(k) = &self.kubernetes_discovery {
+					if !k.skip_crd {
+						match create_kubernetes_crd().await {
+							Ok(()) => (),
+							Err(e) => {
+								error!("Failed to create kubernetes custom resource: {}", e)
+							}
+						};
+					}
+
+					match get_kubernetes_nodes(&k.service_name, &k.namespace).await {
+						Ok(node_list) => {
+							ping_list.extend(node_list);
+						}
+						Err(e) => {
+							warn!("Could not retrieve node list from Kubernetes: {}", e);
+						}
+					}
+				}
+
 				for (node_id, node_addr) in ping_list {
 					tokio::spawn(
 						self.netapp
@ -519,6 +606,10 @@ impl System {

 			self.background.spawn(self.clone().advertise_to_consul());

+			#[cfg(feature = "kubernetes-discovery")]
+			self.background
+				.spawn(self.clone().advertise_to_kubernetes());
+
 			let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
 			select! {
 				_ = restart_at.fuse() => {},
@ -580,3 +671,23 @@ impl EndpointHandler<SystemRpc> for System {
 		}
 	}
 }
+
+fn get_default_ip() -> Option<IpAddr> {
+	pnet_datalink::interfaces()
+		.iter()
+		.find(|e| e.is_up() && !e.is_loopback() && !e.ips.is_empty())
+		.and_then(|e| e.ips.first())
+		.map(|a| a.ip())
+}
+
+struct ConsulDiscoveryParam {
+	consul_host: String,
+	service_name: String,
+}
+
+#[cfg(feature = "kubernetes-discovery")]
+struct KubernetesDiscoveryParam {
+	service_name: String,
+	namespace: String,
+	skip_crd: bool,
+}
--- a/src/table/Cargo.toml
+++ b/src/table/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "garage_table"
-version = "0.6.0"
+version = "0.7.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@ -14,13 +14,15 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-garage_rpc = { version = "0.6.0", path = "../rpc" }
-garage_util = { version = "0.6.0", path = "../util" }
+garage_rpc = { version = "0.7.0", path = "../rpc" }
+garage_util = { version = "0.7.0", path = "../util" }
+
+opentelemetry = "0.17"

 async-trait = "0.1.7"
 bytes = "1.0"
 hexdump = "0.1"
-log = "0.4"
+tracing = "0.1.30"
 rand = "0.8"

 sled = "0.34"
--- a/src/table/data.rs
+++ b/src/table/data.rs
@ -1,18 +1,19 @@
 use core::borrow::Borrow;
 use std::sync::Arc;

-use log::warn;
 use serde_bytes::ByteBuf;
 use sled::Transactional;
 use tokio::sync::Notify;

 use garage_util::data::*;
 use garage_util::error::*;
+use garage_util::sled_counter::SledCountedTree;

 use garage_rpc::system::System;

 use crate::crdt::Crdt;
 use crate::gc::GcTodoEntry;
+use crate::metrics::*;
 use crate::replication::*;
 use crate::schema::*;

@ -27,7 +28,9 @@ pub struct TableData<F: TableSchema, R: TableReplication> {
 	pub(crate) merkle_tree: sled::Tree,
 	pub(crate) merkle_todo: sled::Tree,
 	pub(crate) merkle_todo_notify: Notify,
-	pub(crate) gc_todo: sled::Tree,
+	pub(crate) gc_todo: SledCountedTree,
+
+	pub(crate) metrics: TableMetrics,
 }

 impl<F, R> TableData<F, R>
@ -50,6 +53,9 @@ where
 		let gc_todo = db
 			.open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME))
 			.expect("Unable to open DB tree");
+		let gc_todo = SledCountedTree::new(gc_todo);
+
+		let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone(), gc_todo.clone());

 		Arc::new(Self {
 			system,
@ -60,6 +66,7 @@ where
 			merkle_todo,
 			merkle_todo_notify: Notify::new(),
 			gc_todo,
+			metrics,
 		})
 	}

@ -165,6 +172,8 @@ where
 		})?;

 		if let Some((old_entry, new_entry, new_bytes_hash)) = changed {
+			self.metrics.internal_update_counter.add(1);
+
 			let is_tombstone = new_entry.is_tombstone();
 			self.instance.updated(old_entry, Some(new_entry));
 			self.merkle_todo_notify.notify_one();
@ -199,6 +208,8 @@ where
 		})?;

 		if removed {
+			self.metrics.internal_delete_counter.add(1);
+
 			let old_entry = self.decode_entry(v)?;
 			self.instance.updated(Some(old_entry), None);
 			self.merkle_todo_notify.notify_one();
--- a/src/table/gc.rs
+++ b/src/table/gc.rs
@ -14,6 +14,7 @@ use tokio::sync::watch;

 use garage_util::data::*;
 use garage_util::error::*;
+use garage_util::sled_counter::SledCountedTree;
 use garage_util::time::*;

 use garage_rpc::system::System;
@ -362,7 +363,7 @@ impl GcTodoEntry {
 	}

 	/// Saves the GcTodoEntry in the gc_todo tree
-	pub(crate) fn save(&self, gc_todo_tree: &sled::Tree) -> Result<(), Error> {
+	pub(crate) fn save(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> {
 		gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?;
 		Ok(())
 	}
@ -372,7 +373,7 @@ impl GcTodoEntry {
 	/// This is usefull to remove a todo entry only under the condition
 	/// that it has not changed since the time it was read, i.e.
 	/// what we have to do is still the same
-	pub(crate) fn remove_if_equal(&self, gc_todo_tree: &sled::Tree) -> Result<(), Error> {
+	pub(crate) fn remove_if_equal(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> {
 		let _ = gc_todo_tree.compare_and_swap::<_, _, Vec<u8>>(
 			&self.todo_table_key()[..],
 			Some(self.value_hash),
--- a/src/table/lib.rs
+++ b/src/table/lib.rs
@ -2,8 +2,9 @@
 #![allow(clippy::comparison_chain)]

 #[macro_use]
-extern crate log;
+extern crate tracing;

+mod metrics;
 pub mod schema;
 pub mod util;

--- a/src/table/merkle.rs
+++ b/src/table/merkle.rs
@ -3,7 +3,6 @@ use std::time::Duration;

 use futures::select;
 use futures_util::future::*;
-use log::{debug, warn};
 use serde::{Deserialize, Serialize};
 use sled::transaction::{
 	ConflictableTransactionError, ConflictableTransactionResult, TransactionalTree,
--- a/src/table/metrics.rs
+++ b/src/table/metrics.rs
@ -0,0 +1,96 @@
+use opentelemetry::{global, metrics::*, KeyValue};
+
+use garage_util::sled_counter::SledCountedTree;
+
+/// TableMetrics reference all counter used for metrics
+pub struct TableMetrics {
+	pub(crate) _merkle_todo_len: ValueObserver<u64>,
+	pub(crate) _gc_todo_len: ValueObserver<u64>,
+
+	pub(crate) get_request_counter: BoundCounter<u64>,
+	pub(crate) get_request_duration: BoundValueRecorder<f64>,
+	pub(crate) put_request_counter: BoundCounter<u64>,
+	pub(crate) put_request_duration: BoundValueRecorder<f64>,
+
+	pub(crate) internal_update_counter: BoundCounter<u64>,
+	pub(crate) internal_delete_counter: BoundCounter<u64>,
+
+	pub(crate) sync_items_sent: Counter<u64>,
+	pub(crate) sync_items_received: Counter<u64>,
+}
+impl TableMetrics {
+	pub fn new(
+		table_name: &'static str,
+		merkle_todo: sled::Tree,
+		gc_todo: SledCountedTree,
+	) -> Self {
+		let meter = global::meter(table_name);
+		TableMetrics {
+			_merkle_todo_len: meter
+				.u64_value_observer(
+					"table.merkle_updater_todo_queue_length",
+					move |observer| {
+						observer.observe(
+							merkle_todo.len() as u64,
+							&[KeyValue::new("table_name", table_name)],
+						)
+					},
+				)
+				.with_description("Merkle tree updater TODO queue length")
+				.init(),
+			_gc_todo_len: meter
+				.u64_value_observer(
+					"table.gc_todo_queue_length",
+					move |observer| {
+						observer.observe(
+							gc_todo.len() as u64,
+							&[KeyValue::new("table_name", table_name)],
+						)
+					},
+				)
+				.with_description("Table garbage collector TODO queue length")
+				.init(),
+
+			get_request_counter: meter
+				.u64_counter("table.get_request_counter")
+				.with_description("Number of get/get_range requests internally made on this table")
+				.init()
+				.bind(&[KeyValue::new("table_name", table_name)]),
+			get_request_duration: meter
+				.f64_value_recorder("table.get_request_duration")
+				.with_description("Duration of get/get_range requests internally made on this table, in seconds")
+				.init()
+				.bind(&[KeyValue::new("table_name", table_name)]),
+			put_request_counter: meter
+				.u64_counter("table.put_request_counter")
+				.with_description("Number of insert/insert_many requests internally made on this table")
+				.init()
+				.bind(&[KeyValue::new("table_name", table_name)]),
+			put_request_duration: meter
+				.f64_value_recorder("table.put_request_duration")
+				.with_description("Duration of insert/insert_many requests internally made on this table, in seconds")
+				.init()
+				.bind(&[KeyValue::new("table_name", table_name)]),
+
+			internal_update_counter: meter
+				.u64_counter("table.internal_update_counter")
+				.with_description("Number of value updates where the value actually changes (includes creation of new key and update of existing key)")
+				.init()
+				.bind(&[KeyValue::new("table_name", table_name)]),
+			internal_delete_counter: meter
+				.u64_counter("table.internal_delete_counter")
+				.with_description("Number of value deletions in the tree (due to GC or repartitioning)")
+				.init()
+				.bind(&[KeyValue::new("table_name", table_name)]),
+
+			sync_items_sent: meter
+				.u64_counter("table.sync_items_sent")
+				.with_description("Number of data items sent to other nodes during resync procedures")
+				.init(),
+			sync_items_received: meter
+				.u64_counter("table.sync_items_received")
+				.with_description("Number of data items received from other nodes during resync procedures")
+				.init(),
+		}
+	}
+}
--- a/src/table/replication/mode.rs
+++ b/src/table/replication/mode.rs
@ -1,7 +1,10 @@
 pub enum ReplicationMode {
 	None,
 	TwoWay,
+	TwoWayDangerous,
 	ThreeWay,
+	ThreeWayDegraded,
+	ThreeWayDangerous,
 }

 impl ReplicationMode {
@ -9,7 +12,10 @@ impl ReplicationMode {
 		match v {
 			"none" | "1" => Some(Self::None),
 			"2" => Some(Self::TwoWay),
+			"2-dangerous" => Some(Self::TwoWayDangerous),
 			"3" => Some(Self::ThreeWay),
+			"3-degraded" => Some(Self::ThreeWayDegraded),
+			"3-dangerous" => Some(Self::ThreeWayDangerous),
 			_ => None,
 		}
 	}
@ -24,16 +30,17 @@ impl ReplicationMode {
 	pub fn replication_factor(&self) -> usize {
 		match self {
 			Self::None => 1,
-			Self::TwoWay => 2,
-			Self::ThreeWay => 3,
+			Self::TwoWay | Self::TwoWayDangerous => 2,
+			Self::ThreeWay | Self::ThreeWayDegraded | Self::ThreeWayDangerous => 3,
 		}
 	}

 	pub fn read_quorum(&self) -> usize {
 		match self {
 			Self::None => 1,
-			Self::TwoWay => 1,
+			Self::TwoWay | Self::TwoWayDangerous => 1,
 			Self::ThreeWay => 2,
+			Self::ThreeWayDegraded | Self::ThreeWayDangerous => 1,
 		}
 	}

@ -41,7 +48,9 @@ impl ReplicationMode {
 		match self {
 			Self::None => 1,
 			Self::TwoWay => 2,
-			Self::ThreeWay => 2,
+			Self::TwoWayDangerous => 1,
+			Self::ThreeWay | Self::ThreeWayDegraded => 2,
+			Self::ThreeWayDangerous => 1,
 		}
 	}
 }
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@ -6,6 +6,7 @@ use async_trait::async_trait;
 use futures::select;
 use futures_util::future::*;
 use futures_util::stream::*;
+use opentelemetry::KeyValue;
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 use serde_bytes::ByteBuf;
@ -312,6 +313,16 @@ where
 	) -> Result<(), Error> {
 		let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();

+		for to in nodes.iter() {
+			self.data.metrics.sync_items_sent.add(
+				values.len() as u64,
+				&[
+					KeyValue::new("table_name", F::TABLE_NAME),
+					KeyValue::new("to", format!("{:?}", to)),
+				],
+			);
+		}
+
 		self.system
 			.rpc
 			.try_call_many(
@ -500,6 +511,14 @@ where
 			.map(|x| Arc::new(ByteBuf::from(x)))
 			.collect::<Vec<_>>();

+		self.data.metrics.sync_items_sent.add(
+			values.len() as u64,
+			&[
+				KeyValue::new("table_name", F::TABLE_NAME),
+				KeyValue::new("to", format!("{:?}", who)),
+			],
+		);
+
 		let rpc_resp = self
 			.system
 			.rpc
@ -527,7 +546,7 @@ where
 	F: TableSchema + 'static,
 	R: TableReplication + 'static,
 {
-	async fn handle(self: &Arc<Self>, message: &SyncRpc, _from: NodeID) -> Result<SyncRpc, Error> {
+	async fn handle(self: &Arc<Self>, message: &SyncRpc, from: NodeID) -> Result<SyncRpc, Error> {
 		match message {
 			SyncRpc::RootCkHash(range, h) => {
 				let (_root_ck_key, root_ck) = self.get_root_ck(*range)?;
@ -539,6 +558,17 @@ where
 				Ok(SyncRpc::Node(k.clone(), node))
 			}
 			SyncRpc::Items(items) => {
+				self.data.metrics.sync_items_received.add(
+					items.len() as u64,
+					&[
+						KeyValue::new("table_name", F::TABLE_NAME),
+						KeyValue::new(
+							"from",
+							format!("{:?}", Uuid::try_from(from.as_ref()).unwrap()),
+						),
+					],
+				);
+
 				self.data.update_many(items)?;
 				Ok(SyncRpc::Ok)
 			}
--- a/src/table/table.rs
+++ b/src/table/table.rs
@ -7,8 +7,14 @@ use futures::stream::*;
 use serde::{Deserialize, Serialize};
 use serde_bytes::ByteBuf;

+use opentelemetry::{
+	trace::{FutureExt, TraceContextExt, Tracer},
+	Context,
+};
+
 use garage_util::data::*;
 use garage_util::error::Error;
+use garage_util::metrics::RecordDuration;

 use garage_rpc::system::System;
 use garage_rpc::*;
@ -81,6 +87,20 @@ where
 	}

 	pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
+		let tracer = opentelemetry::global::tracer("garage_table");
+		let span = tracer.start(format!("{} insert", F::TABLE_NAME));
+
+		self.insert_internal(e)
+			.bound_record_duration(&self.data.metrics.put_request_duration)
+			.with_context(Context::current_with_span(span))
+			.await?;
+
+		self.data.metrics.put_request_counter.add(1);
+
+		Ok(())
+	}
+
+	async fn insert_internal(&self, e: &F::E) -> Result<(), Error> {
 		let hash = e.partition_key().hash();
 		let who = self.data.replication.write_nodes(&hash);
 		//eprintln!("insert who: {:?}", who);
@ -99,10 +119,25 @@ where
 					.with_timeout(TABLE_RPC_TIMEOUT),
 			)
 			.await?;
+
 		Ok(())
 	}

 	pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> {
+		let tracer = opentelemetry::global::tracer("garage_table");
+		let span = tracer.start(format!("{} insert_many {}", F::TABLE_NAME, entries.len()));
+
+		self.insert_many_internal(entries)
+			.bound_record_duration(&self.data.metrics.put_request_duration)
+			.with_context(Context::current_with_span(span))
+			.await?;
+
+		self.data.metrics.put_request_counter.add(1);
+
+		Ok(())
+	}
+
+	async fn insert_many_internal(&self, entries: &[F::E]) -> Result<(), Error> {
 		let mut call_list: HashMap<_, Vec<_>> = HashMap::new();

 		for entry in entries.iter() {
@ -148,10 +183,28 @@ where
 		self: &Arc<Self>,
 		partition_key: &F::P,
 		sort_key: &F::S,
+	) -> Result<Option<F::E>, Error> {
+		let tracer = opentelemetry::global::tracer("garage_table");
+		let span = tracer.start(format!("{} get", F::TABLE_NAME));
+
+		let res = self
+			.get_internal(partition_key, sort_key)
+			.bound_record_duration(&self.data.metrics.get_request_duration)
+			.with_context(Context::current_with_span(span))
+			.await?;
+
+		self.data.metrics.get_request_counter.add(1);
+
+		Ok(res)
+	}
+
+	async fn get_internal(
+		self: &Arc<Self>,
+		partition_key: &F::P,
+		sort_key: &F::S,
 	) -> Result<Option<F::E>, Error> {
 		let hash = partition_key.hash();
 		let who = self.data.replication.read_nodes(&hash);
-		//eprintln!("get who: {:?}", who);

 		let rpc = TableRpc::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
 		let resps = self
@ -198,6 +251,7 @@ where
 					.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
 			}
 		}
+
 		Ok(ret)
 	}

@ -207,6 +261,27 @@ where
 		begin_sort_key: Option<F::S>,
 		filter: Option<F::Filter>,
 		limit: usize,
+	) -> Result<Vec<F::E>, Error> {
+		let tracer = opentelemetry::global::tracer("garage_table");
+		let span = tracer.start(format!("{} get_range", F::TABLE_NAME));
+
+		let res = self
+			.get_range_internal(partition_key, begin_sort_key, filter, limit)
+			.bound_record_duration(&self.data.metrics.get_request_duration)
+			.with_context(Context::current_with_span(span))
+			.await?;
+
+		self.data.metrics.get_request_counter.add(1);
+
+		Ok(res)
+	}
+
+	async fn get_range_internal(
+		self: &Arc<Self>,
+		partition_key: &F::P,
+		begin_sort_key: Option<F::S>,
+		filter: Option<F::Filter>,
+		limit: usize,
 	) -> Result<Vec<F::E>, Error> {
 		let hash = partition_key.hash();
 		let who = self.data.replication.read_nodes(&hash);
--- a/src/util/Cargo.toml
+++ b/src/util/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "garage_util"
-version = "0.6.0"
+version = "0.7.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@ -18,7 +18,7 @@ blake2 = "0.9"
 err-derive = "0.3"
 xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] }
 hex = "0.4"
-log = "0.4"
+tracing = "0.1.30"
 rand = "0.8"
 sha2 = "0.9"

@ -34,7 +34,10 @@ futures = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

 #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-netapp = "0.3.0"
+#netapp = { version = "0.4", path = "../../../netapp" }
+netapp = "0.4"

 http = "0.2"
 hyper = "0.14"
+
+opentelemetry = { version = "0.17", features = [ "rt-tokio", "metrics", "trace" ] }
--- a/src/util/config.rs
+++ b/src/util/config.rs
@ -23,6 +23,10 @@ pub struct Config {
 	#[serde(default = "default_block_size")]
 	pub block_size: usize,

+	/// Size of data blocks to save to disk
+	#[serde(default = "default_block_manager_background_tranquility")]
+	pub block_manager_background_tranquility: u32,
+
 	/// Replication mode. Supported values:
 	/// - none, 1 -> no replication
 	/// - 2 -> 2-way replication
@ -52,6 +56,13 @@ pub struct Config {
 	pub consul_host: Option<String>,
 	/// Consul service name to use
 	pub consul_service_name: Option<String>,
+	/// Kubernetes namespace the service discovery resources are be created in
+	pub kubernetes_namespace: Option<String>,
+	/// Service name to filter for in k8s custom resources
+	pub kubernetes_service_name: Option<String>,
+	/// Skip creation of the garagenodes CRD
+	#[serde(default)]
+	pub kubernetes_skip_crd: bool,

 	/// Sled cache size, in bytes
 	#[serde(default = "default_sled_cache_capacity")]
@ -66,6 +77,10 @@ pub struct Config {

 	/// Configuration for serving files as normal web server
 	pub s3_web: WebConfig,
+
+	/// Configuration for the admin API endpoint
+	#[serde(default = "Default::default")]
+	pub admin: AdminConfig,
 }

 /// Configuration for S3 api
@ -89,6 +104,15 @@ pub struct WebConfig {
 	pub root_domain: String,
 }

+/// Configuration for the admin and monitoring HTTP API
+#[derive(Deserialize, Debug, Clone, Default)]
+pub struct AdminConfig {
+	/// Address and port to bind for admin API serving
+	pub api_bind_addr: Option<SocketAddr>,
+	/// OTLP server to where to export traces
+	pub trace_sink: Option<String>,
+}
+
 fn default_sled_cache_capacity() -> u64 {
 	128 * 1024 * 1024
 }
@ -98,6 +122,9 @@ fn default_sled_flush_every_ms() -> u64 {
 fn default_block_size() -> usize {
 	1048576
 }
+fn default_block_manager_background_tranquility() -> u32 {
+	2
+}

 /// Read and parse configuration
 pub fn read_config(config_file: PathBuf) -> Result<Config, Error> {
--- a/src/util/data.rs
+++ b/src/util/data.rs
@ -22,7 +22,7 @@ impl std::convert::AsRef<[u8]> for FixedBytes32 {

 impl fmt::Debug for FixedBytes32 {
 	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-		write!(f, "{}…", hex::encode(&self.0[..8]))
+		write!(f, "{}", hex::encode(&self.0[..8]))
 	}
 }

--- a/src/util/lib.rs
+++ b/src/util/lib.rs
@ -1,14 +1,16 @@
 //! Crate containing common functions and types used in Garage

 #[macro_use]
-extern crate log;
+extern crate tracing;

 pub mod background;
 pub mod config;
 pub mod crdt;
 pub mod data;
 pub mod error;
+pub mod metrics;
 pub mod persister;
+pub mod sled_counter;
 pub mod time;
 pub mod token_bucket;
 pub mod tranquilizer;
--- a/src/util/metrics.rs
+++ b/src/util/metrics.rs
@ -0,0 +1,57 @@
+use std::time::SystemTime;
+
+use futures::{future::BoxFuture, Future, FutureExt};
+use rand::Rng;
+
+use opentelemetry::{metrics::*, trace::TraceId, KeyValue};
+
+pub trait RecordDuration<'a>: 'a {
+	type Output;
+
+	fn record_duration(
+		self,
+		r: &'a ValueRecorder<f64>,
+		attributes: &'a [KeyValue],
+	) -> BoxFuture<'a, Self::Output>;
+	fn bound_record_duration(self, r: &'a BoundValueRecorder<f64>) -> BoxFuture<'a, Self::Output>;
+}
+
+impl<'a, T, O> RecordDuration<'a> for T
+where
+	T: Future<Output = O> + Send + 'a,
+{
+	type Output = O;
+
+	fn record_duration(
+		self,
+		r: &'a ValueRecorder<f64>,
+		attributes: &'a [KeyValue],
+	) -> BoxFuture<'a, Self::Output> {
+		async move {
+			let request_start = SystemTime::now();
+			let res = self.await;
+			r.record(
+				request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()),
+				attributes,
+			);
+			res
+		}
+		.boxed()
+	}
+
+	fn bound_record_duration(self, r: &'a BoundValueRecorder<f64>) -> BoxFuture<'a, Self::Output> {
+		async move {
+			let request_start = SystemTime::now();
+			let res = self.await;
+			r.record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+			res
+		}
+		.boxed()
+	}
+}
+
+// ----
+
+pub fn gen_trace_id() -> TraceId {
+	rand::thread_rng().gen::<[u8; 16]>().into()
+}
--- a/src/util/sled_counter.rs
+++ b/src/util/sled_counter.rs
@ -0,0 +1,100 @@
+use std::sync::{
+	atomic::{AtomicUsize, Ordering},
+	Arc,
+};
+
+use sled::{CompareAndSwapError, IVec, Iter, Result, Tree};
+
+#[derive(Clone)]
+pub struct SledCountedTree(Arc<SledCountedTreeInternal>);
+
+struct SledCountedTreeInternal {
+	tree: Tree,
+	len: AtomicUsize,
+}
+
+impl SledCountedTree {
+	pub fn new(tree: Tree) -> Self {
+		let len = tree.len();
+		Self(Arc::new(SledCountedTreeInternal {
+			tree,
+			len: AtomicUsize::new(len),
+		}))
+	}
+
+	pub fn len(&self) -> usize {
+		self.0.len.load(Ordering::Relaxed)
+	}
+
+	pub fn is_empty(&self) -> bool {
+		self.0.tree.is_empty()
+	}
+
+	pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<IVec>> {
+		self.0.tree.get(key)
+	}
+
+	pub fn iter(&self) -> Iter {
+		self.0.tree.iter()
+	}
+
+	// ---- writing functions ----
+
+	pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<IVec>>
+	where
+		K: AsRef<[u8]>,
+		V: Into<IVec>,
+	{
+		let res = self.0.tree.insert(key, value);
+		if res == Ok(None) {
+			self.0.len.fetch_add(1, Ordering::Relaxed);
+		}
+		res
+	}
+
+	pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<IVec>> {
+		let res = self.0.tree.remove(key);
+		if matches!(res, Ok(Some(_))) {
+			self.0.len.fetch_sub(1, Ordering::Relaxed);
+		}
+		res
+	}
+
+	pub fn pop_min(&self) -> Result<Option<(IVec, IVec)>> {
+		let res = self.0.tree.pop_min();
+		if let Ok(Some(_)) = &res {
+			self.0.len.fetch_sub(1, Ordering::Relaxed);
+		};
+		res
+	}
+
+	pub fn compare_and_swap<K, OV, NV>(
+		&self,
+		key: K,
+		old: Option<OV>,
+		new: Option<NV>,
+	) -> Result<std::result::Result<(), CompareAndSwapError>>
+	where
+		K: AsRef<[u8]>,
+		OV: AsRef<[u8]>,
+		NV: Into<IVec>,
+	{
+		let old_some = old.is_some();
+		let new_some = new.is_some();
+
+		let res = self.0.tree.compare_and_swap(key, old, new);
+
+		if res == Ok(Ok(())) {
+			match (old_some, new_some) {
+				(false, true) => {
+					self.0.len.fetch_add(1, Ordering::Relaxed);
+				}
+				(true, false) => {
+					self.0.len.fetch_sub(1, Ordering::Relaxed);
+				}
+				_ => (),
+			}
+		}
+		res
+	}
+}
--- a/src/web/Cargo.toml
+++ b/src/web/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "garage_web"
-version = "0.6.0"
+version = "0.7.0"
 authors = ["Alex Auvolat <alex@adnab.me>", "Quentin Dufour <quentin@dufour.io>"]
 edition = "2018"
 license = "AGPL-3.0"
@ -14,16 +14,18 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-garage_api = { version = "0.6.0", path = "../api" }
-garage_model = { version = "0.6.0", path = "../model" }
-garage_util = { version = "0.6.0", path = "../util" }
-garage_table = { version = "0.6.0", path = "../table" }
+garage_api = { version = "0.7.0", path = "../api" }
+garage_model = { version = "0.7.0", path = "../model" }
+garage_util = { version = "0.7.0", path = "../util" }
+garage_table = { version = "0.7.0", path = "../table" }

 err-derive = "0.3"
-log = "0.4"
+tracing = "0.1.30"
 percent-encoding = "2.1.0"

 futures = "0.3"

 http = "0.2"
 hyper = { version = "0.14", features = ["server", "http1", "runtime", "tcp", "stream"] }
+
+opentelemetry = "0.17"
--- a/src/web/lib.rs
+++ b/src/web/lib.rs
@ -1,6 +1,6 @@
 //! Crate for handling web serving of s3 bucket
 #[macro_use]
-extern crate log;
+extern crate tracing;

 mod error;
 pub use error::Error;
--- a/Show more
+++ b/Show more
Author	SHA1	Message	Date
Alex Auvolat	dffcd9f4b1	update Cargo.nix	2022-04-08 14:35:09 +02:00
Alex Auvolat	5d404dcd54	Add missing opentelemetry features	2022-04-08 14:21:04 +02:00
Quentin Dufour	62f0715abe	Add/Fix OpenTelemetry	2022-04-07 16:12:35 +02:00
Quentin Dufour	7e1ac51b58	Add files to quickly test k8s	2022-04-07 16:12:35 +02:00
Alex Auvolat	94f1e48fff	Update to netapp 0.4.2 (a tiny fix)	2022-04-07 11:50:03 +02:00
Alex Auvolat	cb5836d53c	Bring maximum exponential backoff time down from 16h to 1h	2022-04-07 11:49:29 +02:00
Quentin Dufour	8e3ee82c3e	Be clearer on what upgrades are (not) supported	2022-04-06 21:45:59 +02:00
Quentin Dufour	a122a8cb46	Add an "upgrading" section, add a guide for 0.7	2022-04-05 10:08:31 +02:00
Quentin Dufour	9fd8ec1dee	Add documentation for winscp+sftpgo	2022-03-31 10:25:56 +02:00
Alex Auvolat	0091002ef2	New replication modes and their documentation	2022-03-28 16:26:04 +02:00
Alex Auvolat	8f9cf3a5d1	fix a clippy lint	2022-03-28 15:48:55 +02:00
Alex Auvolat	913f7754bb	Add blocks in errored state to `garage stats`	2022-03-28 15:47:23 +02:00
Alex Auvolat	42dde54126	Log admin GET requests at debug level instead of info to reduce noise in logs	2022-03-28 15:46:52 +02:00
Alex Auvolat	dca2ffdf91	document administrative options	2022-03-28 12:26:08 +02:00
Quentin Dufour	0cf4efac89	Compile kuberetes-discovery only when release=true	2022-03-24 16:57:43 +01:00
Alex Auvolat	9d0ed78887	Add feature flag for Kubernetes discovery	2022-03-24 16:57:43 +01:00
Alex Auvolat	509d256c58	Make layout optimization work in relative terms	2022-03-24 15:27:14 +01:00
Alex Auvolat	2814d41842	Allow `garage layout assign` to assign to several nodes at once	2022-03-24 15:27:13 +01:00
Alex Auvolat	7e0e2ffda2	Slight change and add comment to layout assignation algo	2022-03-24 15:27:13 +01:00
Alex Auvolat	413ab0eaed	Small change to partition assignation algorithm This change helps ensure that nodes for each partition are spread over all datacenters, a property that wasn't ensured previously when going from a 2 DC deployment to a 3 DC deployment	2022-03-24 15:27:10 +01:00
Alex Auvolat	43945234ae	Add missing src/block to toplevel cargo.toml	2022-03-23 10:26:10 +01:00
Alex Auvolat	3dc9214172	Add lots of comments on how the resync queue works (I don't really want to change/refactor that code though)	2022-03-23 10:25:39 +01:00
Alex Auvolat	077dd1cde9	Clippy	2022-03-23 10:25:39 +01:00
Alex Auvolat	2d13f0aa13	run cargo2nix	2022-03-23 10:25:37 +01:00
Alex Auvolat	e480aaf338	Make background tranquility a configurable parameter	2022-03-23 10:25:19 +01:00
Alex Auvolat	8fd6745745	Move block RC code to separate `rc.rs`	2022-03-23 10:25:19 +01:00
Alex Auvolat	c3982a90b6	Move DataBlock out of manager.rs	2022-03-23 10:25:19 +01:00
Alex Auvolat	c1d9854d2c	Move block manager to separate module	2022-03-23 10:25:15 +01:00
trinity-1686a	8565f7dc31	cleanup	2022-03-23 10:22:37 +01:00
trinity-1686a	8db6b84559	add test for create bucket and put website with streaming signature	2022-03-23 10:22:37 +01:00
trinity-1686a	1eb7fdb08f	add test framework for arbitraty S3 requests and implement some basic test with it	2022-03-23 10:22:36 +01:00
KokaKiwi	e934934f14	garage_api: Update streaming payload stream unit tests	2022-03-23 10:22:36 +01:00
KokaKiwi	98545a16dd	garage_api: Handle streaming payload early in request handling	2022-03-23 10:22:36 +01:00
Alex Auvolat	822128e3c8	Talk a bit about capacity balancing between regions	2022-03-22 12:07:13 +01:00
Rune Henriksen	aea8b41728	document request routing logic	2022-03-21 12:03:57 +01:00
Rune Henriksen	71e6645e09	add short tutorial for duplicati usage with garage	2022-03-21 11:58:19 +01:00
Steam	15da2156f6	Change position of the node-id argument	2022-03-19 18:03:23 +01:00
Quentin Dufour	0529f3c34d	Patch cargo2nix openssl override	2022-03-17 12:17:38 +01:00
Alex Auvolat	db46cdef79	Update netapp to v0.4.1	2022-03-15 17:09:57 +01:00
Alex Auvolat	ba6b56ae68	Fix some new clippy lints	2022-03-14 12:27:49 +01:00
Alex Auvolat	0af314b295	Add comment for fsync	2022-03-14 11:54:00 +01:00
Alex Auvolat	d78bf379fb	Fix resync queue to not drop items	2022-03-14 11:51:37 +01:00
Alex Auvolat	f7e6f4616f	Spawn a single resync worker	2022-03-14 11:51:37 +01:00
Alex Auvolat	dc5ec4ecf9	Add appropriate fsync() calls in write_block to ensure that data is persisted properly	2022-03-14 11:51:32 +01:00
Alex Auvolat	fe62d01b7e	Implement exponential backoff for resync retries	2022-03-14 11:41:20 +01:00
Alex Auvolat	bfb4353df5	Update Grafana dashboard	2022-03-14 10:55:30 +01:00
Alex Auvolat	9b2b531f4d	Make admin server optional	2022-03-14 10:54:25 +01:00
Alex Auvolat	a19341b188	Add Grafana dashboard for Garage	2022-03-14 10:54:25 +01:00
Alex Auvolat	2377a92f6b	Add wrapper over sled tree to count items (used for big queues)	2022-03-14 10:54:25 +01:00
Alex Auvolat	203e8d2c34	Bump version to 0.7 because of incompatible Netapp	2022-03-14 10:54:24 +01:00
Alex Auvolat	f869ca625d	Add spans to table calls, change span names in RPC	2022-03-14 10:54:12 +01:00
Alex Auvolat	0cc31ee169	add missing netapp telemetry feature	2022-03-14 10:54:11 +01:00
Alex Auvolat	dc8d0496cc	Refactoring: rename config files, make modifications less invasive	2022-03-14 10:53:51 +01:00
Alex Auvolat	d9a35359bf	Add metrics to web endpoint	2022-03-14 10:53:50 +01:00
Alex Auvolat	2a5609b292	Add metrics to API endpoint	2022-03-14 10:53:36 +01:00
Alex Auvolat	818daa5c78	Refactor how durations are measured	2022-03-14 10:53:35 +01:00
Alex Auvolat	f0d0cd9a20	Remove strum crate dependency; add protobuf nix dependency	2022-03-14 10:53:00 +01:00
Alex Auvolat	55d4471599	Remove ... at end of hex IDs	2022-03-14 10:52:31 +01:00
Alex Auvolat	bb04d94fa9	Update to Netapp 0.4 which supports distributed tracing	2022-03-14 10:52:30 +01:00
Alex Auvolat	8c2fb0c066	Add tracing integration with opentelemetry	2022-03-14 10:52:13 +01:00
Maximilien	b6561f6e1b	Add docker-compose for traces & metrics	2022-03-14 10:51:52 +01:00
Alex Auvolat	2cab84b1fe	Add many metrics in table/ and rpc/	2022-03-14 10:51:50 +01:00
Maximilien R	1e2cf26373	Implement basic metrics in table	2022-03-14 10:51:17 +01:00
mricher	e349af13a7	Update dependencies and add admin module with metrics - Global dependencies updated in Cargo.lock - New module created in src/admin to host: - the (future) admin REST API - the metric collection - add configuration block No metrics implemented yet	2022-03-14 10:51:12 +01:00
Max Audron	9d44127245	add support for kubernetes service discovery This commit adds support to discover garage instances running in kubernetes. Once enabled by setting `kubernetes_namespace` and `kubernetes_service_name` garage will create a Custom Resources `garagenodes.deuxfleurs.fr` with nodes public key as the resource name. and IP and Port information as spec in the namespace configured by `kubernetes_namespace`. For discovering nodes the resources are filtered with the optionally set `kubernetes_service_name` which sets a label `garage.deuxfleurs.fr/service` on the resources. This allows to separate multiple garage deployments in a single namespace. the `kubernetes_skip_crd` variable allows to disable the creation of the CRD by garage itself. The user must deploy this manually.	2022-03-12 13:05:52 +01:00
Quentin Dufour	c00b2c9948	Functional tests for admin commands	2022-03-07 17:32:07 +01:00
Quentin Dufour	8df1e186de	Functional tests for website endpoints	2022-03-07 17:32:07 +01:00
Quentin Dufour	2ef60b8417	Functional test for multipart endpoints	2022-03-07 17:32:07 +01:00
Quentin Dufour	1e639ec67c	Functional test for ListMultipartUploads	2022-03-07 17:32:07 +01:00
Quentin Dufour	cfea1e0315	Functional tests for bucket endpoints	2022-03-07 17:32:02 +01:00
Quentin Dufour	05eb79929e	Functional tests for object operations	2022-03-07 17:05:10 +01:00
Quentin Dufour	0f4e0e8bb9	Move ListObjects tests to Rust	2022-03-07 17:05:10 +01:00
Quentin Dufour	2a3afcaf65	Test WinSCP	2022-03-03 14:29:10 +01:00
Alex Auvolat	8a5bbc3b0b	More permissive OPTIONS on S3 API	2022-03-01 11:15:16 +01:00
Alex Auvolat	97f245f218	Add tracing output to signature calculation	2022-02-28 12:22:39 +01:00
Alex Auvolat	8129a98291	Process CORS earlier in pipeline	2022-02-28 12:22:39 +01:00
Quentin Dufour	54e02b4c3b	Force static builds for all platforms	2022-02-24 16:12:37 +01:00