Merge pull request 'metadata db snapshotting' (#775) from db-snapshot into main
All checks were successful
ci/woodpecker/push/debug Pipeline was successful
ci/woodpecker/cron/release/3 Pipeline was successful
ci/woodpecker/cron/release/2 Pipeline was successful
ci/woodpecker/cron/debug Pipeline was successful
ci/woodpecker/cron/release/1 Pipeline was successful
ci/woodpecker/cron/release/4 Pipeline was successful
ci/woodpecker/cron/publish Pipeline was successful
Reviewed-on: #775
commit fd2e19bf1b
21 changed files with 380 additions and 11 deletions
Cargo.lock (generated, 1 line changed)
@@ -1438,6 +1438,7 @@ dependencies = [
  "garage_util",
  "hex",
  "opentelemetry",
+ "parse_duration",
  "rand",
  "serde",
  "serde_bytes",
@@ -34,7 +34,7 @@ args@{
 ignoreLockHash,
 }:
 let
-nixifiedLockHash = "8112e20b0e356bed77a9769600c2b2952662ec8af9548eecf8a2d46fe8433189";
+nixifiedLockHash = "f99156ba9724d370b33258f076f078fefc945f0af79292b1a246bd48bef2a9b2";
 workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
 currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
 lockHashIgnored = if ignoreLockHash

@@ -2093,6 +2093,7 @@ in
 garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out;
 hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
 opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
+parse_duration = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parse_duration."2.1.1" { inherit profileName; }).out;
 rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
 serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out;
 serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out;

@@ -4769,6 +4770,7 @@ in
 registry = "registry+https://github.com/rust-lang/crates.io-index";
 src = fetchCratesIo { inherit name version; sha256 = "a78046161564f5e7cd9008aff3b2990b3850dc8e0349119b98e8f251e099f24d"; };
 features = builtins.concatLists [
+(lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "backup")
 (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "bundled")
 (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "modern_sqlite")
 ];
@@ -72,13 +72,14 @@ to store 2 TB of data in total.
 to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md).
 
 - For the metadata storage, Garage does not do checksumming and integrity
-verification on its own. Users have reported that when using the LMDB
-database engine (the default), database files have a tendency of becoming
-corrupted after an unclean shutdown (e.g. a power outage), so you should use
-a robust filesystem such as BTRFS or ZFS for the metadata partition, and take
-regular snapshots so that you can restore to a recent known-good state in
-case of an incident. If you cannot do so, you might want to switch to Sqlite
-which is more robust.
+verification on its own, so it is better to use a robust filesystem such as
+BTRFS or ZFS. Users have reported that when using the LMDB database engine
+(the default), database files have a tendency of becoming corrupted after an
+unclean shutdown (e.g. a power outage), so you should take regular snapshots
+to be able to recover from such a situation. This can be done using Garage's
+built-in automatic snapshotting (since v0.9.4), or by using filesystem level
+snapshots. If you cannot do so, you might want to switch to Sqlite which is
+more robust.
 
 - LMDB is the fastest and most tested database engine, but it has the following
 weaknesses: 1/ data files are not architecture-independent, you cannot simply

@@ -124,6 +125,7 @@ A valid `/etc/garage.toml` for our cluster would look as follows:
 metadata_dir = "/var/lib/garage/meta"
 data_dir = "/var/lib/garage/data"
 db_engine = "lmdb"
+metadata_auto_snapshot_interval = "6h"
 
 replication_mode = "3"
 
@@ -104,6 +104,24 @@ operation will also move out all data from locations marked as read-only.
 
 # Metadata operations
 
+## Metadata snapshotting
+
+It is good practice to setup automatic snapshotting of your metadata database
+file, to recover from situations where it becomes corrupted on disk. This can
+be done at the filesystem level if you are using ZFS or BTRFS.
+
+Since Garage v0.9.4, Garage is able to take snapshots of the metadata database
+itself. This basically amounts to copying the database file, except that it can
+be run live while Garage is running without the risk of corruption or
+inconsistencies. This can be setup to run automatically on a schedule using
+[`metadata_auto_snapshot_interval`](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval).
+A snapshot can also be triggered manually using the `garage meta snapshot`
+command. Note that taking a snapshot using this method is very intensive as it
+requires making a full copy of the database file, so you might prefer using
+filesystem-level snapshots if possible. To recover a corrupted node from such a
+snapshot, read the instructions
+[here](@/documentation/operations/recovering.md#corrupted_meta).
+
 ## Metadata table resync
 
 Garage automatically resyncs all entries stored in the metadata tables every hour,
@@ -108,3 +108,57 @@ garage layout apply # once satisfied, apply the changes
 
 Garage will then start synchronizing all required data on the new node.
 This process can be monitored using the `garage stats -a` command.
+
+## Replacement scenario 3: corrupted metadata {#corrupted_meta}
+
+In some cases, your metadata DB file might become corrupted, for instance if
+your node suffered a power outage and did not shut down properly. In this case,
+you can recover without having to change the node ID and rebuilding a cluster
+layout. This means that data blocks will not need to be shuffled around, you
+must simply find a way to repair the metadata file. The best way is generally
+to discard the corrupted file and recover it from another source.
+
+First of all, start by locating the database file in your metadata directory,
+which [depends on your `db_engine`
+choice](@/documentation/reference-manual/configuration.md#db_engine). Then,
+your recovery options are as follows:
+
+- **Option 1: resyncing from other nodes.** In case your cluster is replicated
+with two or three copies, you can simply delete the database file, and Garage
+will resync from other nodes. To do so, stop Garage, delete the database file
+or directory, and restart Garage. Then, do a full table repair by calling
+`garage repair -a --yes tables`. This will take a bit of time to complete as
+the new node will need to receive copies of the metadata tables from the
+network.
+
+- **Option 2: restoring a snapshot taken by Garage.** Since v0.9.4, Garage can
+[automatically take regular
+snapshots](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval)
+of your metadata DB file. This file or directory should be located under
+`<metadata_dir>/snapshots`, and is named according to the UTC time at which it
+was taken. Stop Garage, discard the database file/directory and replace it by the
+snapshot you want to use. For instance, in the case of LMDB:
+
+```bash
+cd $METADATA_DIR
+mv db.lmdb db.lmdb.bak
+cp -r snapshots/2024-03-15T12:13:52Z db.lmdb
+```
+
+And for Sqlite:
+
+```bash
+cd $METADATA_DIR
+mv db.sqlite db.sqlite.bak
+cp snapshots/2024-03-15T12:13:52Z db.sqlite
+```
+
+Then, restart Garage and run a full table repair by calling `garage repair -a
+--yes tables`. This should run relatively fast as only the changes that
+occurred since the snapshot was taken will need to be resynchronized. Of
+course, if your cluster is not replicated, you will lose all changes that
+occurred since the snapshot was taken.
+
+- **Option 3: restoring a filesystem-level snapshot.** If you are using ZFS or
+BTRFS to snapshot your metadata partition, refer to their specific
+documentation on rolling back or copying files from an old snapshot.
@@ -73,6 +73,18 @@ The entire procedure would look something like this:
 You can do all of the nodes in a single zone at once as that won't impact global cluster availability.
 Do not try to make a backup of the metadata folder of a running node.
 
+**Since Garage v0.9.4,** you can use the `garage meta snapshot --all` command
+to take a simultaneous snapshot of the metadata database files of all your
+nodes. This avoids the tedious process of having to take them down one by
+one before upgrading. Be careful that if automatic snapshotting is enabled,
+Garage only keeps the last two snapshots and deletes older ones, so you might
+want to disable automatic snapshotting in your upgraded configuration file
+until you have confirmed that the upgrade ran successfully. In addition to
+snapshotting the metadata databases of your nodes, you should back-up at
+least the `cluster_layout` file of one of your Garage instances (this file
+should be the same on all nodes and you can copy it safely while Garage is
+running).
+
 3. Prepare your binaries and configuration files for the new Garage version
 
 4. Restart all nodes simultaneously in the new version
@@ -15,6 +15,7 @@ data_dir = "/var/lib/garage/data"
 metadata_fsync = true
 data_fsync = false
 disable_scrub = false
+metadata_auto_snapshot_interval = "6h"
 
 db_engine = "lmdb"
 

@@ -90,6 +91,7 @@ Top-level configuration options:
 [`db_engine`](#db_engine),
 [`disable_scrub`](#disable_scrub),
 [`lmdb_map_size`](#lmdb_map_size),
+[`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval),
 [`metadata_dir`](#metadata_dir),
 [`metadata_fsync`](#metadata_fsync),
 [`replication_mode`](#replication_mode),

@@ -346,6 +348,25 @@ at the cost of a moderate drop in write performance.
 Similarly to `metatada_fsync`, this is likely not necessary
 if geographical replication is used.
 
+#### `metadata_auto_snapshot_interval` (since Garage v0.9.4) {#metadata_auto_snapshot_interval}
+
+If this value is set, Garage will automatically take a snapshot of the metadata
+DB file at a regular interval and save it in the metadata directory.
+This can allow to recover from situations where the metadata DB file is corrupted,
+for instance after an unclean shutdown.
+See [this page](@/documentation/operations/recovering.md#corrupted_meta) for details.
+
+Garage keeps only the two most recent snapshots of the metadata DB and deletes
+older ones automatically.
+
+Note that taking a metadata snapshot is a relatively intensive operation as the
+entire data file is copied. A snapshot being taken might have performance
+impacts on the Garage node while it is running. If the cluster is under heavy
+write load when a snapshot operation is running, this might also cause the
+database file to grow in size significantly as pages cannot be recycled easily.
+For this reason, it might be better to use filesystem-level snapshots instead
+if possible.
+
 #### `disable_scrub` {#disable_scrub}
 
 By default, Garage runs a scrub of the data directory approximately once per
@@ -17,7 +17,7 @@ hexdump.workspace = true
 tracing.workspace = true
 
 heed = { workspace = true, optional = true }
-rusqlite = { workspace = true, optional = true }
+rusqlite = { workspace = true, optional = true, features = ["backup"] }
 sled = { workspace = true, optional = true }
 
 [dev-dependencies]
@@ -19,6 +19,7 @@ use core::ops::{Bound, RangeBounds};
 
 use std::borrow::Cow;
 use std::cell::Cell;
+use std::path::PathBuf;
 use std::sync::Arc;
 
 use err_derive::Error;

@@ -48,6 +49,12 @@ pub type TxValueIter<'a> = Box<dyn std::iter::Iterator<Item = TxOpResult<(Value,
 #[error(display = "{}", _0)]
 pub struct Error(pub Cow<'static, str>);
 
+impl From<std::io::Error> for Error {
+    fn from(e: std::io::Error) -> Error {
+        Error(format!("IO: {}", e).into())
+    }
+}
+
 pub type Result<T> = std::result::Result<T, Error>;
 
 #[derive(Debug, Error)]

@@ -129,6 +136,10 @@ impl Db {
         }
     }
 
+    pub fn snapshot(&self, path: &PathBuf) -> Result<()> {
+        self.0.snapshot(path)
+    }
+
     pub fn import(&self, other: &Db) -> Result<()> {
         let existing_trees = self.list_trees()?;
         if !existing_trees.is_empty() {

@@ -325,6 +336,7 @@ pub(crate) trait IDb: Send + Sync {
     fn engine(&self) -> String;
     fn open_tree(&self, name: &str) -> Result<usize>;
     fn list_trees(&self) -> Result<Vec<String>>;
+    fn snapshot(&self, path: &PathBuf) -> Result<()>;
 
     fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
     fn len(&self, tree: usize) -> Result<usize>;
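The hunks above give every storage engine a uniform snapshot entry point: `Db::snapshot` on the public wrapper and `fn snapshot` on the `IDb` trait. Below is a minimal sketch of how a caller can drive it; it is illustrative only, the helper name and directory layout are assumptions that mirror `src/model/snapshot.rs` further down, and `chrono` is assumed to be available as a dependency.

```rust
use std::path::PathBuf;

// Illustrative sketch, not part of the patch: snapshot a garage_db::Db into a
// timestamped sub-directory, the way src/model/snapshot.rs does.
fn snapshot_into(db: &garage_db::Db, snapshots_dir: &PathBuf) -> garage_db::Result<()> {
    // The new `From<std::io::Error>` impl above lets `?` convert IO errors.
    std::fs::create_dir_all(snapshots_dir)?;

    // One entry per snapshot, named after the UTC time at which it was taken.
    let mut dest = snapshots_dir.clone();
    dest.push(chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true));

    // Each engine decides what it writes at `dest`: a directory holding
    // data.mdb for LMDB, a single file for Sqlite, a new database directory for Sled.
    db.snapshot(&dest)
}
```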
@@ -3,6 +3,7 @@ use core::ptr::NonNull;
 
 use std::collections::HashMap;
 use std::convert::TryInto;
+use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 
 use heed::types::ByteSlice;

@@ -102,6 +103,15 @@ impl IDb for LmdbDb {
         Ok(ret2)
     }
 
+    fn snapshot(&self, to: &PathBuf) -> Result<()> {
+        std::fs::create_dir_all(to)?;
+        let mut path = to.clone();
+        path.push("data.mdb");
+        self.db
+            .copy_to_path(path, heed::CompactionOption::Disabled)?;
+        Ok(())
+    }
+
     // ----
 
     fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
@@ -2,6 +2,7 @@ use core::ops::Bound;
 
 use std::cell::Cell;
 use std::collections::HashMap;
+use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 
 use sled::transaction::{

@@ -96,6 +97,13 @@ impl IDb for SledDb {
         Ok(trees)
     }
 
+    fn snapshot(&self, to: &PathBuf) -> Result<()> {
+        let to_db = sled::open(to)?;
+        let export = self.db.export();
+        to_db.import(export);
+        Ok(())
+    }
+
     // ----
 
     fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
@@ -2,6 +2,7 @@ use core::ops::Bound;
 
 use std::borrow::BorrowMut;
 use std::marker::PhantomPinned;
+use std::path::PathBuf;
 use std::pin::Pin;
 use std::ptr::NonNull;
 use std::sync::{Arc, Mutex, MutexGuard};

@@ -119,6 +120,17 @@ impl IDb for SqliteDb {
         Ok(trees)
     }
 
+    fn snapshot(&self, to: &PathBuf) -> Result<()> {
+        fn progress(p: rusqlite::backup::Progress) {
+            let percent = (p.pagecount - p.remaining) * 100 / p.pagecount;
+            info!("Sqlite snapshot progres: {}%", percent);
+        }
+        let this = self.0.lock().unwrap();
+        this.db
+            .backup(rusqlite::DatabaseName::Main, to, Some(progress))?;
+        Ok(())
+    }
+
     // ----
 
     fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
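The Sqlite implementation above relies on rusqlite's online backup API, which becomes available through the `backup` feature enabled in the Cargo.toml hunk earlier in this diff. As a standalone, hedged illustration of that API (the function name is invented, and a plain `rusqlite::Connection` is assumed instead of Garage's locked wrapper):

```rust
use std::path::Path;

use rusqlite::{backup::Progress, Connection, DatabaseName};

// Illustrative sketch: copy a live SQLite database page by page while
// reporting progress, the same call the adapter above performs.
fn backup_live_db(conn: &Connection, dst: &Path) -> rusqlite::Result<()> {
    fn progress(p: Progress) {
        // pagecount and remaining are page counts, so this is the percentage copied.
        let percent = (p.pagecount - p.remaining) * 100 / p.pagecount;
        println!("backup progress: {}%", percent);
    }
    // DatabaseName::Main targets the primary (non-ATTACHed) database.
    conn.backup(DatabaseName::Main, dst, Some(progress))
}
```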
@@ -46,6 +46,7 @@ pub enum AdminRpc {
     Stats(StatsOpt),
     Worker(WorkerOperation),
     BlockOperation(BlockOperation),
+    MetaOperation(MetaOperation),
 
     // Replies
     Ok(String),

@@ -518,6 +519,44 @@ impl AdminRpcHandler {
             )]))
         }
     }
+
+    // ================ META DB COMMANDS ====================
+
+    async fn handle_meta_cmd(self: &Arc<Self>, mo: &MetaOperation) -> Result<AdminRpc, Error> {
+        match mo {
+            MetaOperation::Snapshot { all: true } => {
+                let ring = self.garage.system.ring.borrow().clone();
+                let to = ring.layout.node_ids().to_vec();
+
+                let resps = futures::future::join_all(to.iter().map(|to| async move {
+                    let to = (*to).into();
+                    self.endpoint
+                        .call(
+                            &to,
+                            AdminRpc::MetaOperation(MetaOperation::Snapshot { all: false }),
+                            PRIO_NORMAL,
+                        )
+                        .await
+                }))
+                .await;
+
+                let mut ret = vec![];
+                for (to, resp) in to.iter().zip(resps.iter()) {
+                    let res_str = match resp {
+                        Ok(_) => "ok".to_string(),
+                        Err(e) => format!("error: {}", e),
+                    };
+                    ret.push(format!("{:?}\t{}", to, res_str));
+                }
+
+                Ok(AdminRpc::Ok(format_table_to_string(ret)))
+            }
+            MetaOperation::Snapshot { all: false } => {
+                garage_model::snapshot::async_snapshot_metadata(&self.garage).await?;
+                Ok(AdminRpc::Ok("Snapshot has been saved.".into()))
+            }
+        }
+    }
 }
 
 #[async_trait]

@@ -535,6 +574,7 @@ impl EndpointHandler<AdminRpc> for AdminRpcHandler {
             AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await,
             AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await,
             AdminRpc::BlockOperation(bo) => self.handle_block_cmd(bo).await,
+            AdminRpc::MetaOperation(mo) => self.handle_meta_cmd(mo).await,
             m => Err(GarageError::unexpected_rpc_message(m).into()),
         }
     }
@@ -44,6 +44,9 @@ pub async fn cli_command_dispatch(
         Command::Block(bo) => {
             cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::BlockOperation(bo)).await
         }
+        Command::Meta(mo) => {
+            cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::MetaOperation(mo)).await
+        }
         _ => unreachable!(),
     }
 }
@@ -57,6 +57,10 @@ pub enum Command {
     #[structopt(name = "block", version = garage_version())]
     Block(BlockOperation),
 
+    /// Operations on the metadata db
+    #[structopt(name = "meta", version = garage_version())]
+    Meta(MetaOperation),
+
     /// Convert metadata db between database engine formats
     #[structopt(name = "convert-db", version = garage_version())]
     ConvertDb(convert_db::ConvertDbOpt),

@@ -617,3 +621,14 @@ pub enum BlockOperation {
         blocks: Vec<String>,
     },
 }
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)]
+pub enum MetaOperation {
+    /// Save a snapshot of the metadata db file
+    #[structopt(name = "snapshot", version = garage_version())]
+    Snapshot {
+        /// Run on all nodes instead of only local node
+        #[structopt(long = "all")]
+        all: bool,
+    },
+}
@@ -51,7 +51,7 @@ pub async fn run_server(config_file: PathBuf, secrets: Secrets) -> Result<(), Er
     let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone());
 
     info!("Spawning Garage workers...");
-    garage.spawn_workers(&background);
+    garage.spawn_workers(&background)?;
 
     if config.admin.trace_sink.is_some() {
         info!("Initialize tracing...");
@@ -28,6 +28,7 @@ chrono.workspace = true
 err-derive.workspace = true
 hex.workspace = true
 base64.workspace = true
+parse_duration.workspace = true
 tracing.workspace = true
 rand.workspace = true
 zstd.workspace = true
@@ -278,7 +278,7 @@ impl Garage {
         }))
     }
 
-    pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
+    pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) -> Result<(), Error> {
         self.block_manager.spawn_workers(bg);
 
         self.bucket_table.spawn_workers(bg);

@@ -299,6 +299,23 @@ impl Garage {
 
         #[cfg(feature = "k2v")]
         self.k2v.spawn_workers(bg);
+
+        if let Some(itv) = self.config.metadata_auto_snapshot_interval.as_deref() {
+            let interval = parse_duration::parse(itv)
+                .ok_or_message("Invalid `metadata_auto_snapshot_interval`")?;
+            if interval < std::time::Duration::from_secs(600) {
+                return Err(Error::Message(
+                    "metadata_auto_snapshot_interval too small or negative".into(),
+                ));
+            }
+
+            bg.spawn_worker(crate::snapshot::AutoSnapshotWorker::new(
+                self.clone(),
+                interval,
+            ));
+        }
+
+        Ok(())
     }
 
     pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
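For reference, a worked sketch of what the interval validation above accepts; the helper name is hypothetical, but `parse_duration::parse` and the 600-second floor are exactly what the hunk uses:

```rust
// Illustrative only: mirrors the check performed in spawn_workers above.
fn interval_is_acceptable(itv: &str) -> bool {
    match parse_duration::parse(itv) {
        Ok(interval) => interval >= std::time::Duration::from_secs(600),
        Err(_) => false,
    }
}

// interval_is_acceptable("6h")        == true   (21600 s, the value used in the doc hunks above)
// interval_is_acceptable("5 minutes") == false  (300 s, below the 10-minute floor)
// interval_is_acceptable("garbage")   == false  (not a parsable duration)
```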
@@ -19,3 +19,4 @@ pub mod s3;
 pub mod garage;
 pub mod helper;
 pub mod migrate;
+pub mod snapshot;
src/model/snapshot.rs (new normal file, 136 lines added)

@@ -0,0 +1,136 @@
+use std::fs;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::sync::Mutex;
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+use rand::prelude::*;
+use tokio::sync::watch;
+
+use garage_util::background::*;
+use garage_util::error::*;
+
+use crate::garage::Garage;
+
+// The two most recent snapshots are kept
+const KEEP_SNAPSHOTS: usize = 2;
+
+static SNAPSHOT_MUTEX: Mutex<()> = Mutex::new(());
+
+// ================ snapshotting logic =====================
+
+/// Run snashot_metadata in a blocking thread and async await on it
+pub async fn async_snapshot_metadata(garage: &Arc<Garage>) -> Result<(), Error> {
+    let garage = garage.clone();
+    let worker = tokio::task::spawn_blocking(move || snapshot_metadata(&garage));
+    worker.await.unwrap()?;
+    Ok(())
+}
+
+/// Take a snapshot of the metadata database, and erase older
+/// snapshots if necessary.
+/// This is not an async function, it should be spawned on a thread pool
+pub fn snapshot_metadata(garage: &Garage) -> Result<(), Error> {
+    let lock = match SNAPSHOT_MUTEX.try_lock() {
+        Ok(lock) => lock,
+        Err(_) => {
+            return Err(Error::Message(
+                "Cannot acquire lock, another snapshot might be in progress".into(),
+            ))
+        }
+    };
+
+    let mut snapshots_dir = garage.config.metadata_dir.clone();
+    snapshots_dir.push("snapshots");
+    fs::create_dir_all(&snapshots_dir)?;
+
+    let mut new_path = snapshots_dir.clone();
+    new_path.push(chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true));
+
+    info!("Snapshotting metadata db to {}", new_path.display());
+    garage.db.snapshot(&new_path)?;
+    info!("Metadata db snapshot finished");
+
+    if let Err(e) = cleanup_snapshots(&snapshots_dir) {
+        error!("Failed to do cleanup in snapshots directory: {}", e);
+    }
+
+    drop(lock);
+
+    Ok(())
+}
+
+fn cleanup_snapshots(snapshots_dir: &PathBuf) -> Result<(), Error> {
+    let mut snapshots =
+        fs::read_dir(&snapshots_dir)?.collect::<Result<Vec<fs::DirEntry>, std::io::Error>>()?;
+
+    snapshots.retain(|x| x.file_name().len() > 8);
+    snapshots.sort_by_key(|x| x.file_name());
+
+    for to_delete in snapshots.iter().rev().skip(KEEP_SNAPSHOTS) {
+        let path = snapshots_dir.join(to_delete.path());
+        if to_delete.metadata()?.file_type().is_dir() {
+            for file in fs::read_dir(&path)? {
+                let file = file?;
+                if file.metadata()?.is_file() {
+                    fs::remove_file(path.join(file.path()))?;
+                }
+            }
+            std::fs::remove_dir(&path)?;
+        } else {
+            std::fs::remove_file(&path)?;
+        }
+    }
+    Ok(())
+}
+
+// ================ auto snapshot worker =====================
+
+pub struct AutoSnapshotWorker {
+    garage: Arc<Garage>,
+    next_snapshot: Instant,
+    snapshot_interval: Duration,
+}
+
+impl AutoSnapshotWorker {
+    pub(crate) fn new(garage: Arc<Garage>, snapshot_interval: Duration) -> Self {
+        Self {
+            garage,
+            snapshot_interval,
+            next_snapshot: Instant::now() + (snapshot_interval / 2),
+        }
+    }
+}
+
+#[async_trait]
+impl Worker for AutoSnapshotWorker {
+    fn name(&self) -> String {
+        "Metadata snapshot worker".into()
+    }
+    fn status(&self) -> WorkerStatus {
+        WorkerStatus {
+            freeform: vec![format!(
+                "Next snapshot: {}",
+                (chrono::Utc::now() + (self.next_snapshot - Instant::now())).to_rfc3339()
+            )],
+            ..Default::default()
+        }
+    }
+    async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+        if Instant::now() < self.next_snapshot {
+            return Ok(WorkerState::Idle);
+        }
+
+        async_snapshot_metadata(&self.garage).await?;
+
+        let rand_factor = 1f32 + thread_rng().gen::<f32>() / 5f32;
+        self.next_snapshot = Instant::now() + self.snapshot_interval.mul_f32(rand_factor);
+
+        Ok(WorkerState::Idle)
+    }
+    async fn wait_for_work(&mut self) -> WorkerState {
+        tokio::time::sleep_until(self.next_snapshot.into()).await;
+        WorkerState::Busy
+    }
+}
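The retention rule in `cleanup_snapshots` above keeps only the `KEEP_SNAPSHOTS = 2` most recent entries, which works because the RFC 3339 directory names sort chronologically. A hedged sketch of that selection on plain names (the helper is illustrative, not part of the patch):

```rust
// Illustrative only: with KEEP_SNAPSHOTS = 2, cleanup_snapshots deletes every
// entry except the two lexicographically-largest (i.e. most recent) ones.
fn names_to_delete(mut names: Vec<&str>) -> Vec<&str> {
    names.sort();
    names.iter().rev().skip(2).cloned().collect()
}

// names_to_delete(vec!["2024-03-14T06:00:00Z", "2024-03-14T12:00:00Z", "2024-03-15T12:13:52Z"])
// == vec!["2024-03-14T06:00:00Z"]
```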
@@ -27,6 +27,10 @@ pub struct Config {
     #[serde(default)]
     pub disable_scrub: bool,
 
+    /// Automatic snapshot interval for metadata
+    #[serde(default)]
+    pub metadata_auto_snapshot_interval: Option<String>,
+
     /// Size of data blocks to save to disk
     #[serde(
         deserialize_with = "deserialize_capacity",