Merge pull request 'metadata db snapshotting' (#775) from db-snapshot into main

Reviewed-on: Deuxfleurs/garage#775
This commit is contained in:
Alex 2024-03-15 13:17:53 +00:00
commit fd2e19bf1b
21 changed files with 380 additions and 11 deletions

1
Cargo.lock generated
View file

@ -1438,6 +1438,7 @@ dependencies = [
"garage_util",
"hex",
"opentelemetry",
"parse_duration",
"rand",
"serde",
"serde_bytes",

View file

@ -34,7 +34,7 @@ args@{
ignoreLockHash,
}:
let
nixifiedLockHash = "8112e20b0e356bed77a9769600c2b2952662ec8af9548eecf8a2d46fe8433189";
nixifiedLockHash = "f99156ba9724d370b33258f076f078fefc945f0af79292b1a246bd48bef2a9b2";
workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
lockHashIgnored = if ignoreLockHash
@ -2093,6 +2093,7 @@ in
garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out;
hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
parse_duration = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parse_duration."2.1.1" { inherit profileName; }).out;
rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out;
serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out;
@ -4769,6 +4770,7 @@ in
registry = "registry+https://github.com/rust-lang/crates.io-index";
src = fetchCratesIo { inherit name version; sha256 = "a78046161564f5e7cd9008aff3b2990b3850dc8e0349119b98e8f251e099f24d"; };
features = builtins.concatLists [
(lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "backup")
(lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "bundled")
(lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "modern_sqlite")
];

View file

@ -72,13 +72,14 @@ to store 2 TB of data in total.
to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md).
- For the metadata storage, Garage does not do checksumming and integrity
verification on its own. Users have reported that when using the LMDB
database engine (the default), database files have a tendency of becoming
corrupted after an unclean shutdown (e.g. a power outage), so you should use
a robust filesystem such as BTRFS or ZFS for the metadata partition, and take
regular snapshots so that you can restore to a recent known-good state in
case of an incident. If you cannot do so, you might want to switch to Sqlite
which is more robust.
verification on its own, so it is better to use a robust filesystem such as
BTRFS or ZFS. Users have reported that when using the LMDB database engine
(the default), database files have a tendency of becoming corrupted after an
unclean shutdown (e.g. a power outage), so you should take regular snapshots
to be able to recover from such a situation. This can be done using Garage's
built-in automatic snapshotting (since v0.9.4), or by using filesystem level
snapshots. If you cannot do so, you might want to switch to Sqlite which is
more robust.
- LMDB is the fastest and most tested database engine, but it has the following
weaknesses: 1/ data files are not architecture-independent, you cannot simply
@ -124,6 +125,7 @@ A valid `/etc/garage.toml` for our cluster would look as follows:
metadata_dir = "/var/lib/garage/meta"
data_dir = "/var/lib/garage/data"
db_engine = "lmdb"
metadata_auto_snapshot_interval = "6h"
replication_mode = "3"

View file

@ -104,6 +104,24 @@ operation will also move out all data from locations marked as read-only.
# Metadata operations
## Metadata snapshotting
It is good practice to set up automatic snapshotting of your metadata database
file, to recover from situations where it becomes corrupted on disk. This can
be done at the filesystem level if you are using ZFS or BTRFS.
Since Garage v0.9.4, Garage is able to take snapshots of the metadata database
itself. This basically amounts to copying the database file, except that it can
be run live while Garage is running without the risk of corruption or
inconsistencies. This can be set up to run automatically on a schedule using
[`metadata_auto_snapshot_interval`](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval).
A snapshot can also be triggered manually using the `garage meta snapshot`
command. Note that taking a snapshot using this method is very intensive as it
requires making a full copy of the database file, so you might prefer using
filesystem-level snapshots if possible. To recover a corrupted node from such a
snapshot, read the instructions
[here](@/documentation/operations/recovering.md#corrupted_meta).
## Metadata table resync
Garage automatically resyncs all entries stored in the metadata tables every hour,

View file

@ -108,3 +108,57 @@ garage layout apply # once satisfied, apply the changes
Garage will then start synchronizing all required data on the new node.
This process can be monitored using the `garage stats -a` command.
## Replacement scenario 3: corrupted metadata {#corrupted_meta}
In some cases, your metadata DB file might become corrupted, for instance if
your node suffered a power outage and did not shut down properly. In this case,
you can recover without having to change the node ID and rebuild a cluster
layout. This means that data blocks will not need to be shuffled around, you
must simply find a way to repair the metadata file. The best way is generally
to discard the corrupted file and recover it from another source.
First of all, start by locating the database file in your metadata directory,
which [depends on your `db_engine`
choice](@/documentation/reference-manual/configuration.md#db_engine). Then,
your recovery options are as follows:
- **Option 1: resyncing from other nodes.** In case your cluster is replicated
with two or three copies, you can simply delete the database file, and Garage
will resync from other nodes. To do so, stop Garage, delete the database file
or directory, and restart Garage. Then, do a full table repair by calling
`garage repair -a --yes tables`. This will take a bit of time to complete as
the new node will need to receive copies of the metadata tables from the
network.
- **Option 2: restoring a snapshot taken by Garage.** Since v0.9.4, Garage can
[automatically take regular
snapshots](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval)
of your metadata DB file. This file or directory should be located under
`<metadata_dir>/snapshots`, and is named according to the UTC time at which it
was taken. Stop Garage, discard the database file/directory and replace it by the
snapshot you want to use. For instance, in the case of LMDB:
```bash
cd $METADATA_DIR
mv db.lmdb db.lmdb.bak
cp -r snapshots/2024-03-15T12:13:52Z db.lmdb
```
And for Sqlite:
```bash
cd $METADATA_DIR
mv db.sqlite db.sqlite.bak
cp snapshots/2024-03-15T12:13:52Z db.sqlite
```
Then, restart Garage and run a full table repair by calling `garage repair -a
--yes tables`. This should run relatively fast as only the changes that
occurred since the snapshot was taken will need to be resynchronized. Of
course, if your cluster is not replicated, you will lose all changes that
occurred since the snapshot was taken.
- **Option 3: restoring a filesystem-level snapshot.** If you are using ZFS or
BTRFS to snapshot your metadata partition, refer to their specific
documentation on rolling back or copying files from an old snapshot.

View file

@ -73,6 +73,18 @@ The entire procedure would look something like this:
You can do all of the nodes in a single zone at once as that won't impact global cluster availability.
Do not try to make a backup of the metadata folder of a running node.
**Since Garage v0.9.4,** you can use the `garage meta snapshot --all` command
to take a simultaneous snapshot of the metadata database files of all your
nodes. This avoids the tedious process of having to take them down one by
one before upgrading. Be careful that if automatic snapshotting is enabled,
Garage only keeps the last two snapshots and deletes older ones, so you might
want to disable automatic snapshotting in your upgraded configuration file
until you have confirmed that the upgrade ran successfully. In addition to
snapshotting the metadata databases of your nodes, you should back up at
least the `cluster_layout` file of one of your Garage instances (this file
should be the same on all nodes and you can copy it safely while Garage is
running).
3. Prepare your binaries and configuration files for the new Garage version
4. Restart all nodes simultaneously in the new version

View file

@ -15,6 +15,7 @@ data_dir = "/var/lib/garage/data"
metadata_fsync = true
data_fsync = false
disable_scrub = false
metadata_auto_snapshot_interval = "6h"
db_engine = "lmdb"
@ -90,6 +91,7 @@ Top-level configuration options:
[`db_engine`](#db_engine),
[`disable_scrub`](#disable_scrub),
[`lmdb_map_size`](#lmdb_map_size),
[`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval),
[`metadata_dir`](#metadata_dir),
[`metadata_fsync`](#metadata_fsync),
[`replication_mode`](#replication_mode),
@ -346,6 +348,25 @@ at the cost of a moderate drop in write performance.
Similarly to `metadata_fsync`, this is likely not necessary
if geographical replication is used.
#### `metadata_auto_snapshot_interval` (since Garage v0.9.4) {#metadata_auto_snapshot_interval}
If this value is set, Garage will automatically take a snapshot of the metadata
DB file at a regular interval and save it in the metadata directory.
This makes it possible to recover from situations where the metadata DB file is corrupted,
for instance after an unclean shutdown.
See [this page](@/documentation/operations/recovering.md#corrupted_meta) for details.
Garage keeps only the two most recent snapshots of the metadata DB and deletes
older ones automatically.
Note that taking a metadata snapshot is a relatively intensive operation as the
entire data file is copied. A snapshot being taken might have performance
impacts on the Garage node while it is running. If the cluster is under heavy
write load when a snapshot operation is running, this might also cause the
database file to grow in size significantly as pages cannot be recycled easily.
For this reason, it might be better to use filesystem-level snapshots instead
if possible.
#### `disable_scrub` {#disable_scrub}
By default, Garage runs a scrub of the data directory approximately once per

View file

@ -17,7 +17,7 @@ hexdump.workspace = true
tracing.workspace = true
heed = { workspace = true, optional = true }
rusqlite = { workspace = true, optional = true }
rusqlite = { workspace = true, optional = true, features = ["backup"] }
sled = { workspace = true, optional = true }
[dev-dependencies]

View file

@ -19,6 +19,7 @@ use core::ops::{Bound, RangeBounds};
use std::borrow::Cow;
use std::cell::Cell;
use std::path::PathBuf;
use std::sync::Arc;
use err_derive::Error;
@ -48,6 +49,12 @@ pub type TxValueIter<'a> = Box<dyn std::iter::Iterator<Item = TxOpResult<(Value,
#[error(display = "{}", _0)]
pub struct Error(pub Cow<'static, str>);
impl From<std::io::Error> for Error {
fn from(e: std::io::Error) -> Error {
Error(format!("IO: {}", e).into())
}
}
pub type Result<T> = std::result::Result<T, Error>;
#[derive(Debug, Error)]
@ -129,6 +136,10 @@ impl Db {
}
}
/// Takes a snapshot of this database and writes it to `path`.
///
/// This simply forwards to the `snapshot` method of the wrapped backend
/// (`self.0`); what ends up at `path` (a file or a directory) is decided by
/// that backend.
pub fn snapshot(&self, path: &PathBuf) -> Result<()> {
self.0.snapshot(path)
}
pub fn import(&self, other: &Db) -> Result<()> {
let existing_trees = self.list_trees()?;
if !existing_trees.is_empty() {
@ -325,6 +336,7 @@ pub(crate) trait IDb: Send + Sync {
fn engine(&self) -> String;
fn open_tree(&self, name: &str) -> Result<usize>;
fn list_trees(&self) -> Result<Vec<String>>;
fn snapshot(&self, path: &PathBuf) -> Result<()>;
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
fn len(&self, tree: usize) -> Result<usize>;

View file

@ -3,6 +3,7 @@ use core::ptr::NonNull;
use std::collections::HashMap;
use std::convert::TryInto;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use heed::types::ByteSlice;
@ -102,6 +103,15 @@ impl IDb for LmdbDb {
Ok(ret2)
}
/// Snapshots the LMDB environment into the directory `to`.
///
/// The directory is created if it does not exist, and the copy is written
/// to a `data.mdb` file inside it with compaction disabled.
fn snapshot(&self, to: &PathBuf) -> Result<()> {
    std::fs::create_dir_all(to)?;
    let target_file = to.join("data.mdb");
    self.db
        .copy_to_path(target_file, heed::CompactionOption::Disabled)?;
    Ok(())
}
// ----
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {

View file

@ -2,6 +2,7 @@ use core::ops::Bound;
use std::cell::Cell;
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use sled::transaction::{
@ -96,6 +97,13 @@ impl IDb for SledDb {
Ok(trees)
}
/// Snapshots the database by opening a fresh sled database at `to` and
/// importing an export of the live database into it.
fn snapshot(&self, to: &PathBuf) -> Result<()> {
    let snapshot_db = sled::open(to)?;
    snapshot_db.import(self.db.export());
    Ok(())
}
// ----
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {

View file

@ -2,6 +2,7 @@ use core::ops::Bound;
use std::borrow::BorrowMut;
use std::marker::PhantomPinned;
use std::path::PathBuf;
use std::pin::Pin;
use std::ptr::NonNull;
use std::sync::{Arc, Mutex, MutexGuard};
@ -119,6 +120,17 @@ impl IDb for SqliteDb {
Ok(trees)
}
/// Copies the live database to the file `to` using rusqlite's
/// `Connection::backup`, logging progress as it goes.
///
/// The connection mutex is held for the whole duration of the backup.
fn snapshot(&self, to: &PathBuf) -> Result<()> {
    // Progress callback: logs the percentage of pages copied so far.
    fn progress(p: rusqlite::backup::Progress) {
        // Widen before multiplying so that `* 100` cannot overflow on very
        // large databases, and do not divide by zero when the source
        // database reports no pages at all (the backup is then trivially
        // complete).
        let total = i64::from(p.pagecount);
        let copied = total - i64::from(p.remaining);
        let percent = if total > 0 { copied * 100 / total } else { 100 };
        info!("Sqlite snapshot progress: {}%", percent);
    }
    let this = self.0.lock().unwrap();
    this.db
        .backup(rusqlite::DatabaseName::Main, to, Some(progress))?;
    Ok(())
}
// ----
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {

View file

@ -46,6 +46,7 @@ pub enum AdminRpc {
Stats(StatsOpt),
Worker(WorkerOperation),
BlockOperation(BlockOperation),
MetaOperation(MetaOperation),
// Replies
Ok(String),
@ -518,6 +519,44 @@ impl AdminRpcHandler {
)]))
}
}
// ================ META DB COMMANDS ====================
/// Handles operations on the metadata database.
///
/// * `Snapshot { all: false }` takes a snapshot of the local metadata db.
/// * `Snapshot { all: true }` sends a `Snapshot { all: false }` request to
///   every node id of the current ring layout, waits for all the answers and
///   returns a table with one `<node>\t<ok|error: ...>` line per node.
async fn handle_meta_cmd(self: &Arc<Self>, mo: &MetaOperation) -> Result<AdminRpc, Error> {
    match mo {
        MetaOperation::Snapshot { all: false } => {
            garage_model::snapshot::async_snapshot_metadata(&self.garage).await?;
            Ok(AdminRpc::Ok("Snapshot has been saved.".into()))
        }
        MetaOperation::Snapshot { all: true } => {
            let ring = self.garage.system.ring.borrow().clone();
            let nodes = ring.layout.node_ids().to_vec();

            // One request per node, all of them awaited concurrently.
            let requests = nodes.iter().map(|node| async move {
                let node = (*node).into();
                self.endpoint
                    .call(
                        &node,
                        AdminRpc::MetaOperation(MetaOperation::Snapshot { all: false }),
                        PRIO_NORMAL,
                    )
                    .await
            });
            let responses = futures::future::join_all(requests).await;

            // `join_all` keeps the input order, so answers line up with `nodes`.
            let table = nodes
                .iter()
                .zip(responses.iter())
                .map(|(node, resp)| {
                    let outcome = match resp {
                        Ok(_) => "ok".to_string(),
                        Err(e) => format!("error: {}", e),
                    };
                    format!("{:?}\t{}", node, outcome)
                })
                .collect::<Vec<String>>();

            Ok(AdminRpc::Ok(format_table_to_string(table)))
        }
    }
}
}
#[async_trait]
@ -535,6 +574,7 @@ impl EndpointHandler<AdminRpc> for AdminRpcHandler {
AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await,
AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await,
AdminRpc::BlockOperation(bo) => self.handle_block_cmd(bo).await,
AdminRpc::MetaOperation(mo) => self.handle_meta_cmd(mo).await,
m => Err(GarageError::unexpected_rpc_message(m).into()),
}
}

View file

@ -44,6 +44,9 @@ pub async fn cli_command_dispatch(
Command::Block(bo) => {
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::BlockOperation(bo)).await
}
Command::Meta(mo) => {
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::MetaOperation(mo)).await
}
_ => unreachable!(),
}
}

View file

@ -57,6 +57,10 @@ pub enum Command {
#[structopt(name = "block", version = garage_version())]
Block(BlockOperation),
/// Operations on the metadata db
#[structopt(name = "meta", version = garage_version())]
Meta(MetaOperation),
/// Convert metadata db between database engine formats
#[structopt(name = "convert-db", version = garage_version())]
ConvertDb(convert_db::ConvertDbOpt),
@ -617,3 +621,14 @@ pub enum BlockOperation {
blocks: Vec<String>,
},
}
// Sub-commands of `garage meta`. This type is also sent over the admin RPC
// (hence the Serialize/Deserialize derives).
// NOTE: the `///` comments below are picked up by structopt as CLI help text,
// so editing them changes user-facing output.
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)]
pub enum MetaOperation {
/// Save a snapshot of the metadata db file
#[structopt(name = "snapshot", version = garage_version())]
Snapshot {
/// Run on all nodes instead of only local node
#[structopt(long = "all")]
all: bool,
},
}

View file

@ -51,7 +51,7 @@ pub async fn run_server(config_file: PathBuf, secrets: Secrets) -> Result<(), Er
let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone());
info!("Spawning Garage workers...");
garage.spawn_workers(&background);
garage.spawn_workers(&background)?;
if config.admin.trace_sink.is_some() {
info!("Initialize tracing...");

View file

@ -28,6 +28,7 @@ chrono.workspace = true
err-derive.workspace = true
hex.workspace = true
base64.workspace = true
parse_duration.workspace = true
tracing.workspace = true
rand.workspace = true
zstd.workspace = true

View file

@ -278,7 +278,7 @@ impl Garage {
}))
}
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) -> Result<(), Error> {
self.block_manager.spawn_workers(bg);
self.bucket_table.spawn_workers(bg);
@ -299,6 +299,23 @@ impl Garage {
#[cfg(feature = "k2v")]
self.k2v.spawn_workers(bg);
if let Some(itv) = self.config.metadata_auto_snapshot_interval.as_deref() {
let interval = parse_duration::parse(itv)
.ok_or_message("Invalid `metadata_auto_snapshot_interval`")?;
if interval < std::time::Duration::from_secs(600) {
return Err(Error::Message(
"metadata_auto_snapshot_interval too small or negative".into(),
));
}
bg.spawn_worker(crate::snapshot::AutoSnapshotWorker::new(
self.clone(),
interval,
));
}
Ok(())
}
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {

View file

@ -19,3 +19,4 @@ pub mod s3;
pub mod garage;
pub mod helper;
pub mod migrate;
pub mod snapshot;

136
src/model/snapshot.rs Normal file
View file

@ -0,0 +1,136 @@
use std::fs;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::Mutex;
use std::time::{Duration, Instant};
use async_trait::async_trait;
use rand::prelude::*;
use tokio::sync::watch;
use garage_util::background::*;
use garage_util::error::*;
use crate::garage::Garage;
// The two most recent snapshots are kept
const KEEP_SNAPSHOTS: usize = 2;
// Process-wide guard ensuring that only one snapshot runs at a time:
// `snapshot_metadata` takes it with `try_lock` and returns an error instead
// of waiting when it is already held.
static SNAPSHOT_MUTEX: Mutex<()> = Mutex::new(());
// ================ snapshotting logic =====================
/// Async wrapper around `snapshot_metadata`: runs it on tokio's blocking
/// thread pool and awaits its completion.
///
/// # Panics
/// Panics if the blocking task cannot be joined.
pub async fn async_snapshot_metadata(garage: &Arc<Garage>) -> Result<(), Error> {
    let garage = Arc::clone(garage);
    tokio::task::spawn_blocking(move || snapshot_metadata(&garage))
        .await
        .unwrap()
}
/// Take a snapshot of the metadata database, and erase older
/// snapshots if necessary.
/// This is not an async function, it should be spawned on a thread pool
pub fn snapshot_metadata(garage: &Garage) -> Result<(), Error> {
let lock = match SNAPSHOT_MUTEX.try_lock() {
Ok(lock) => lock,
Err(_) => {
return Err(Error::Message(
"Cannot acquire lock, another snapshot might be in progress".into(),
))
}
};
let mut snapshots_dir = garage.config.metadata_dir.clone();
snapshots_dir.push("snapshots");
fs::create_dir_all(&snapshots_dir)?;
let mut new_path = snapshots_dir.clone();
new_path.push(chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true));
info!("Snapshotting metadata db to {}", new_path.display());
garage.db.snapshot(&new_path)?;
info!("Metadata db snapshot finished");
if let Err(e) = cleanup_snapshots(&snapshots_dir) {
error!("Failed to do cleanup in snapshots directory: {}", e);
}
drop(lock);
Ok(())
}
/// Deletes old snapshots from `snapshots_dir`, keeping only the
/// `KEEP_SNAPSHOTS` most recent ones.
///
/// Entries whose file name has a length of 8 or less are ignored. The
/// remaining entries are ordered by file name and all but the last
/// `KEEP_SNAPSHOTS` are removed. A snapshot that is a directory is removed by
/// deleting the regular files directly inside it and then the (now expected
/// to be empty) directory; nested directories are not traversed, so the
/// removal fails if there are any.
///
/// # Errors
/// Returns the first I/O error encountered while listing or deleting.
fn cleanup_snapshots(snapshots_dir: &PathBuf) -> Result<(), Error> {
    let mut snapshots =
        fs::read_dir(snapshots_dir)?.collect::<Result<Vec<fs::DirEntry>, std::io::Error>>()?;

    snapshots.retain(|x| x.file_name().len() > 8);
    snapshots.sort_by_key(|x| x.file_name());

    for to_delete in snapshots.iter().rev().skip(KEEP_SNAPSHOTS) {
        // `DirEntry::path()` already contains the directory that was given
        // to `read_dir`. Joining it onto `snapshots_dir` again would
        // duplicate that prefix whenever the directory is a relative path.
        let path = to_delete.path();
        if to_delete.file_type()?.is_dir() {
            for file in fs::read_dir(&path)? {
                let file = file?;
                if file.file_type()?.is_file() {
                    fs::remove_file(file.path())?;
                }
            }
            fs::remove_dir(&path)?;
        } else {
            fs::remove_file(&path)?;
        }
    }
    Ok(())
}
// ================ auto snapshot worker =====================
/// Background worker that takes a snapshot of the metadata database each
/// time `next_snapshot` is reached, then schedules the following one.
pub struct AutoSnapshotWorker {
/// Handle passed to `async_snapshot_metadata` when a snapshot is due
garage: Arc<Garage>,
/// Instant at which the next snapshot should be taken
next_snapshot: Instant,
/// Base interval between two snapshots (a random factor is applied on top
/// when rescheduling)
snapshot_interval: Duration,
}
impl AutoSnapshotWorker {
    /// Creates the worker. The first snapshot is scheduled half an interval
    /// after creation.
    pub(crate) fn new(garage: Arc<Garage>, snapshot_interval: Duration) -> Self {
        let first_snapshot = Instant::now() + snapshot_interval / 2;
        Self {
            garage,
            next_snapshot: first_snapshot,
            snapshot_interval,
        }
    }
}
#[async_trait]
impl Worker for AutoSnapshotWorker {
    /// Human-readable name of this worker.
    fn name(&self) -> String {
        "Metadata snapshot worker".into()
    }

    /// Reports the wall-clock time at which the next snapshot is scheduled.
    fn status(&self) -> WorkerStatus {
        // `next_snapshot` may already be in the past (a snapshot is due or
        // currently running). Subtracting a later `Instant` is documented as
        // having panicked in the past and possibly again in the future, so
        // clamp the remaining time to zero explicitly.
        let remaining = self
            .next_snapshot
            .saturating_duration_since(Instant::now());
        WorkerStatus {
            freeform: vec![format!(
                "Next snapshot: {}",
                (chrono::Utc::now() + remaining).to_rfc3339()
            )],
            ..Default::default()
        }
    }

    /// Takes a snapshot if one is due, then schedules the next one.
    ///
    /// If the snapshot fails, the error is returned and `next_snapshot` is
    /// left unchanged.
    async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
        if Instant::now() < self.next_snapshot {
            return Ok(WorkerState::Idle);
        }

        async_snapshot_metadata(&self.garage).await?;

        // Stretch the interval by a random factor between 1.0 and 1.2.
        let rand_factor = 1f32 + thread_rng().gen::<f32>() / 5f32;
        self.next_snapshot = Instant::now() + self.snapshot_interval.mul_f32(rand_factor);

        Ok(WorkerState::Idle)
    }

    /// Sleeps until the next snapshot is due, then reports the worker as busy.
    async fn wait_for_work(&mut self) -> WorkerState {
        tokio::time::sleep_until(self.next_snapshot.into()).await;
        WorkerState::Busy
    }
}

View file

@ -27,6 +27,10 @@ pub struct Config {
#[serde(default)]
pub disable_scrub: bool,
/// Automatic snapshot interval for metadata
#[serde(default)]
pub metadata_auto_snapshot_interval: Option<String>,
/// Size of data blocks to save to disk
#[serde(
deserialize_with = "deserialize_capacity",