From 8dff278b7227007d8a2c2f6c6d6f76c0c5d7a1ac Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 14 Mar 2024 17:24:53 +0100 Subject: [PATCH 1/4] [db-snapshot] Implement db snapshotting logic in garage_db --- Cargo.nix | 1 + src/db/Cargo.toml | 2 +- src/db/lib.rs | 12 ++++++++++++ src/db/lmdb_adapter.rs | 10 ++++++++++ src/db/sled_adapter.rs | 8 ++++++++ src/db/sqlite_adapter.rs | 12 ++++++++++++ 6 files changed, 44 insertions(+), 1 deletion(-) diff --git a/Cargo.nix b/Cargo.nix index 0a4ca99a..453d83bb 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -4769,6 +4769,7 @@ in registry = "registry+https://github.com/rust-lang/crates.io-index"; src = fetchCratesIo { inherit name version; sha256 = "a78046161564f5e7cd9008aff3b2990b3850dc8e0349119b98e8f251e099f24d"; }; features = builtins.concatLists [ + (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "backup") (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "bundled") (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "modern_sqlite") ]; diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index d7c89620..324de74c 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -17,7 +17,7 @@ hexdump.workspace = true tracing.workspace = true heed = { workspace = true, optional = true } -rusqlite = { workspace = true, optional = true } +rusqlite = { workspace = true, optional = true, features = ["backup"] } sled = { workspace = true, optional = true } [dev-dependencies] diff --git a/src/db/lib.rs b/src/db/lib.rs index 0fb457ce..7f19172f 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -19,6 +19,7 @@ use core::ops::{Bound, RangeBounds}; use std::borrow::Cow; use std::cell::Cell; +use std::path::PathBuf; use std::sync::Arc; use err_derive::Error; @@ -48,6 +49,12 @@ pub type TxValueIter<'a> = Box); +impl From for Error { + fn from(e: std::io::Error) -> Error { + Error(format!("IO: {}", e).into()) + } +} + pub type Result = std::result::Result; #[derive(Debug, Error)] @@ -129,6 +136,10 @@ impl Db { } } + pub fn snapshot(&self, path: &PathBuf) -> Result<()> { + self.0.snapshot(path) + } + pub fn import(&self, other: &Db) -> Result<()> { let existing_trees = self.list_trees()?; if !existing_trees.is_empty() { @@ -325,6 +336,7 @@ pub(crate) trait IDb: Send + Sync { fn engine(&self) -> String; fn open_tree(&self, name: &str) -> Result; fn list_trees(&self) -> Result>; + fn snapshot(&self, path: &PathBuf) -> Result<()>; fn get(&self, tree: usize, key: &[u8]) -> Result>; fn len(&self, tree: usize) -> Result; diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 59fa132d..4b131aff 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -3,6 +3,7 @@ use core::ptr::NonNull; use std::collections::HashMap; use std::convert::TryInto; +use std::path::PathBuf; use std::sync::{Arc, RwLock}; use heed::types::ByteSlice; @@ -102,6 +103,15 @@ impl IDb for LmdbDb { Ok(ret2) } + fn snapshot(&self, to: &PathBuf) -> Result<()> { + std::fs::create_dir_all(to)?; + let mut path = to.clone(); + path.push("data.mdb"); + self.db + .copy_to_path(path, heed::CompactionOption::Disabled)?; + Ok(()) + } + // ---- fn get(&self, tree: usize, key: &[u8]) -> Result> { diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs index 84f2001b..c34b4d81 100644 --- a/src/db/sled_adapter.rs +++ b/src/db/sled_adapter.rs @@ -2,6 +2,7 @@ use core::ops::Bound; use std::cell::Cell; use std::collections::HashMap; +use std::path::PathBuf; use std::sync::{Arc, RwLock}; use sled::transaction::{ @@ -96,6 +97,13 @@ impl IDb for SledDb { Ok(trees) } + fn snapshot(&self, to: &PathBuf) -> Result<()> { + let to_db = sled::open(to)?; + let export = self.db.export(); + to_db.import(export); + Ok(()) + } + // ---- fn get(&self, tree: usize, key: &[u8]) -> Result> { diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 9f967c66..827f3cc3 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -2,6 +2,7 @@ use core::ops::Bound; use std::borrow::BorrowMut; use std::marker::PhantomPinned; +use std::path::PathBuf; use std::pin::Pin; use std::ptr::NonNull; use std::sync::{Arc, Mutex, MutexGuard}; @@ -119,6 +120,17 @@ impl IDb for SqliteDb { Ok(trees) } + fn snapshot(&self, to: &PathBuf) -> Result<()> { + fn progress(p: rusqlite::backup::Progress) { + let percent = (p.pagecount - p.remaining) * 100 / p.pagecount; + info!("Sqlite snapshot progres: {}%", percent); + } + let this = self.0.lock().unwrap(); + this.db + .backup(rusqlite::DatabaseName::Main, to, Some(progress))?; + Ok(()) + } + // ---- fn get(&self, tree: usize, key: &[u8]) -> Result> { From 1e42808a59c35aae297d9dfd1c166c09f0b99347 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 14 Mar 2024 18:21:37 +0100 Subject: [PATCH 2/4] [db-snapshot] implement meta_auto_snapshot_interval --- Cargo.lock | 1 + Cargo.nix | 3 +- src/garage/server.rs | 2 +- src/model/Cargo.toml | 1 + src/model/garage.rs | 19 +++++- src/model/lib.rs | 1 + src/model/snapshot.rs | 136 ++++++++++++++++++++++++++++++++++++++++++ src/util/config.rs | 4 ++ 8 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 src/model/snapshot.rs diff --git a/Cargo.lock b/Cargo.lock index f9bf0c0a..573ff5ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1438,6 +1438,7 @@ dependencies = [ "garage_util", "hex", "opentelemetry", + "parse_duration", "rand", "serde", "serde_bytes", diff --git a/Cargo.nix b/Cargo.nix index 453d83bb..a3ac4c90 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -34,7 +34,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "8112e20b0e356bed77a9769600c2b2952662ec8af9548eecf8a2d46fe8433189"; + nixifiedLockHash = "f99156ba9724d370b33258f076f078fefc945f0af79292b1a246bd48bef2a9b2"; workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -2093,6 +2093,7 @@ in garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; + parse_duration = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parse_duration."2.1.1" { inherit profileName; }).out; rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out; serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out; serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out; diff --git a/src/garage/server.rs b/src/garage/server.rs index 6323f957..65bf34db 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -51,7 +51,7 @@ pub async fn run_server(config_file: PathBuf, secrets: Secrets) -> Result<(), Er let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone()); info!("Spawning Garage workers..."); - garage.spawn_workers(&background); + garage.spawn_workers(&background)?; if config.admin.trace_sink.is_some() { info!("Initialize tracing..."); diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 2e5b047d..bde354b5 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -28,6 +28,7 @@ chrono.workspace = true err-derive.workspace = true hex.workspace = true base64.workspace = true +parse_duration.workspace = true tracing.workspace = true rand.workspace = true zstd.workspace = true diff --git a/src/model/garage.rs b/src/model/garage.rs index acf943f6..a6f60546 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -278,7 +278,7 @@ impl Garage { })) } - pub fn spawn_workers(self: &Arc, bg: &BackgroundRunner) { + pub fn spawn_workers(self: &Arc, bg: &BackgroundRunner) -> Result<(), Error> { self.block_manager.spawn_workers(bg); self.bucket_table.spawn_workers(bg); @@ -299,6 +299,23 @@ impl Garage { #[cfg(feature = "k2v")] self.k2v.spawn_workers(bg); + + if let Some(itv) = self.config.metadata_auto_snapshot_interval.as_deref() { + let interval = parse_duration::parse(itv) + .ok_or_message("Invalid `metadata_auto_snapshot_interval`")?; + if interval < std::time::Duration::from_secs(600) { + return Err(Error::Message( + "metadata_auto_snapshot_interval too small or negative".into(), + )); + } + + bg.spawn_worker(crate::snapshot::AutoSnapshotWorker::new( + self.clone(), + interval, + )); + } + + Ok(()) } pub fn bucket_helper(&self) -> helper::bucket::BucketHelper { diff --git a/src/model/lib.rs b/src/model/lib.rs index 4f20ea46..8ec338da 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -19,3 +19,4 @@ pub mod s3; pub mod garage; pub mod helper; pub mod migrate; +pub mod snapshot; diff --git a/src/model/snapshot.rs b/src/model/snapshot.rs new file mode 100644 index 00000000..36f9ec7d --- /dev/null +++ b/src/model/snapshot.rs @@ -0,0 +1,136 @@ +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; +use std::sync::Mutex; +use std::time::{Duration, Instant}; + +use async_trait::async_trait; +use rand::prelude::*; +use tokio::sync::watch; + +use garage_util::background::*; +use garage_util::error::*; + +use crate::garage::Garage; + +// The two most recent snapshots are kept +const KEEP_SNAPSHOTS: usize = 2; + +static SNAPSHOT_MUTEX: Mutex<()> = Mutex::new(()); + +// ================ snapshotting logic ===================== + +/// Run snashot_metadata in a blocking thread and async await on it +pub async fn async_snapshot_metadata(garage: &Arc) -> Result<(), Error> { + let garage = garage.clone(); + let worker = tokio::task::spawn_blocking(move || snapshot_metadata(&garage)); + worker.await.unwrap()?; + Ok(()) +} + +/// Take a snapshot of the metadata database, and erase older +/// snapshots if necessary. +/// This is not an async function, it should be spawned on a thread pool +pub fn snapshot_metadata(garage: &Garage) -> Result<(), Error> { + let lock = match SNAPSHOT_MUTEX.try_lock() { + Ok(lock) => lock, + Err(_) => { + return Err(Error::Message( + "Cannot acquire lock, another snapshot might be in progress".into(), + )) + } + }; + + let mut snapshots_dir = garage.config.metadata_dir.clone(); + snapshots_dir.push("snapshots"); + fs::create_dir_all(&snapshots_dir)?; + + let mut new_path = snapshots_dir.clone(); + new_path.push(chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true)); + + info!("Snapshotting metadata db to {}", new_path.display()); + garage.db.snapshot(&new_path)?; + info!("Metadata db snapshot finished"); + + if let Err(e) = cleanup_snapshots(&snapshots_dir) { + error!("Failed to do cleanup in snapshots directory: {}", e); + } + + drop(lock); + + Ok(()) +} + +fn cleanup_snapshots(snapshots_dir: &PathBuf) -> Result<(), Error> { + let mut snapshots = + fs::read_dir(&snapshots_dir)?.collect::, std::io::Error>>()?; + + snapshots.retain(|x| x.file_name().len() > 8); + snapshots.sort_by_key(|x| x.file_name()); + + for to_delete in snapshots.iter().rev().skip(KEEP_SNAPSHOTS) { + let path = snapshots_dir.join(to_delete.path()); + if to_delete.metadata()?.file_type().is_dir() { + for file in fs::read_dir(&path)? { + let file = file?; + if file.metadata()?.is_file() { + fs::remove_file(path.join(file.path()))?; + } + } + std::fs::remove_dir(&path)?; + } else { + std::fs::remove_file(&path)?; + } + } + Ok(()) +} + +// ================ auto snapshot worker ===================== + +pub struct AutoSnapshotWorker { + garage: Arc, + next_snapshot: Instant, + snapshot_interval: Duration, +} + +impl AutoSnapshotWorker { + pub(crate) fn new(garage: Arc, snapshot_interval: Duration) -> Self { + Self { + garage, + snapshot_interval, + next_snapshot: Instant::now() + (snapshot_interval / 2), + } + } +} + +#[async_trait] +impl Worker for AutoSnapshotWorker { + fn name(&self) -> String { + "Metadata snapshot worker".into() + } + fn status(&self) -> WorkerStatus { + WorkerStatus { + freeform: vec![format!( + "Next snapshot: {}", + (chrono::Utc::now() + (self.next_snapshot - Instant::now())).to_rfc3339() + )], + ..Default::default() + } + } + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + if Instant::now() < self.next_snapshot { + return Ok(WorkerState::Idle); + } + + async_snapshot_metadata(&self.garage).await?; + + let rand_factor = 1f32 + thread_rng().gen::() / 5f32; + self.next_snapshot = Instant::now() + self.snapshot_interval.mul_f32(rand_factor); + + Ok(WorkerState::Idle) + } + async fn wait_for_work(&mut self) -> WorkerState { + tokio::time::sleep_until(self.next_snapshot.into()).await; + WorkerState::Busy + } +} diff --git a/src/util/config.rs b/src/util/config.rs index 7338a506..8ecbdfbb 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -27,6 +27,10 @@ pub struct Config { #[serde(default)] pub disable_scrub: bool, + /// Automatic snapshot interval for metadata + #[serde(default)] + pub metadata_auto_snapshot_interval: Option, + /// Size of data blocks to save to disk #[serde( deserialize_with = "deserialize_capacity", From a68c37555d15bb19a10f74c7ee85485a5228ab66 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 15 Mar 2024 10:56:57 +0100 Subject: [PATCH 3/4] [db-snapshot] add garage meta snapshot cli operation --- src/garage/admin/mod.rs | 40 +++++++++++++++++++++++++++++++++++++++ src/garage/cli/cmd.rs | 3 +++ src/garage/cli/structs.rs | 15 +++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index b6f9c426..f01ef3d6 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -46,6 +46,7 @@ pub enum AdminRpc { Stats(StatsOpt), Worker(WorkerOperation), BlockOperation(BlockOperation), + MetaOperation(MetaOperation), // Replies Ok(String), @@ -518,6 +519,44 @@ impl AdminRpcHandler { )])) } } + + // ================ META DB COMMANDS ==================== + + async fn handle_meta_cmd(self: &Arc, mo: &MetaOperation) -> Result { + match mo { + MetaOperation::Snapshot { all: true } => { + let ring = self.garage.system.ring.borrow().clone(); + let to = ring.layout.node_ids().to_vec(); + + let resps = futures::future::join_all(to.iter().map(|to| async move { + let to = (*to).into(); + self.endpoint + .call( + &to, + AdminRpc::MetaOperation(MetaOperation::Snapshot { all: false }), + PRIO_NORMAL, + ) + .await + })) + .await; + + let mut ret = vec![]; + for (to, resp) in to.iter().zip(resps.iter()) { + let res_str = match resp { + Ok(_) => "ok".to_string(), + Err(e) => format!("error: {}", e), + }; + ret.push(format!("{:?}\t{}", to, res_str)); + } + + Ok(AdminRpc::Ok(format_table_to_string(ret))) + } + MetaOperation::Snapshot { all: false } => { + garage_model::snapshot::async_snapshot_metadata(&self.garage).await?; + Ok(AdminRpc::Ok("Snapshot has been saved.".into())) + } + } + } } #[async_trait] @@ -535,6 +574,7 @@ impl EndpointHandler for AdminRpcHandler { AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await, AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await, AdminRpc::BlockOperation(bo) => self.handle_block_cmd(bo).await, + AdminRpc::MetaOperation(mo) => self.handle_meta_cmd(mo).await, m => Err(GarageError::unexpected_rpc_message(m).into()), } } diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 48359614..4c0a5322 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -44,6 +44,9 @@ pub async fn cli_command_dispatch( Command::Block(bo) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::BlockOperation(bo)).await } + Command::Meta(mo) => { + cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::MetaOperation(mo)).await + } _ => unreachable!(), } } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index be4d5bd6..51d2bed3 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -57,6 +57,10 @@ pub enum Command { #[structopt(name = "block", version = garage_version())] Block(BlockOperation), + /// Operations on the metadata db + #[structopt(name = "meta", version = garage_version())] + Meta(MetaOperation), + /// Convert metadata db between database engine formats #[structopt(name = "convert-db", version = garage_version())] ConvertDb(convert_db::ConvertDbOpt), @@ -617,3 +621,14 @@ pub enum BlockOperation { blocks: Vec, }, } + +#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)] +pub enum MetaOperation { + /// Save a snapshot of the metadata db file + #[structopt(name = "snapshot", version = garage_version())] + Snapshot { + /// Run on all nodes instead of only local node + #[structopt(long = "all")] + all: bool, + }, +} From 8cf3d24875d41d79ab08d637cd38d2a5b9e527dd Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 15 Mar 2024 13:16:41 +0100 Subject: [PATCH 4/4] [db-snapshot] documentation for metadata db snapshots --- doc/book/cookbook/real-world.md | 16 ++++--- doc/book/operations/durability-repairs.md | 18 ++++++++ doc/book/operations/recovering.md | 54 ++++++++++++++++++++++ doc/book/operations/upgrading.md | 12 +++++ doc/book/reference-manual/configuration.md | 21 +++++++++ 5 files changed, 114 insertions(+), 7 deletions(-) diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index 15a58b9b..9e226030 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -72,13 +72,14 @@ to store 2 TB of data in total. to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md). - For the metadata storage, Garage does not do checksumming and integrity - verification on its own. Users have reported that when using the LMDB - database engine (the default), database files have a tendency of becoming - corrupted after an unclean shutdown (e.g. a power outage), so you should use - a robust filesystem such as BTRFS or ZFS for the metadata partition, and take - regular snapshots so that you can restore to a recent known-good state in - case of an incident. If you cannot do so, you might want to switch to Sqlite - which is more robust. + verification on its own, so it is better to use a robust filesystem such as + BTRFS or ZFS. Users have reported that when using the LMDB database engine + (the default), database files have a tendency of becoming corrupted after an + unclean shutdown (e.g. a power outage), so you should take regular snapshots + to be able to recover from such a situation. This can be done using Garage's + built-in automatic snapshotting (since v0.9.4), or by using filesystem level + snapshots. If you cannot do so, you might want to switch to Sqlite which is + more robust. - LMDB is the fastest and most tested database engine, but it has the following weaknesses: 1/ data files are not architecture-independent, you cannot simply @@ -124,6 +125,7 @@ A valid `/etc/garage.toml` for our cluster would look as follows: metadata_dir = "/var/lib/garage/meta" data_dir = "/var/lib/garage/data" db_engine = "lmdb" +metadata_auto_snapshot_interval = "6h" replication_mode = "3" diff --git a/doc/book/operations/durability-repairs.md b/doc/book/operations/durability-repairs.md index f4450dae..c76dc39e 100644 --- a/doc/book/operations/durability-repairs.md +++ b/doc/book/operations/durability-repairs.md @@ -104,6 +104,24 @@ operation will also move out all data from locations marked as read-only. # Metadata operations +## Metadata snapshotting + +It is good practice to setup automatic snapshotting of your metadata database +file, to recover from situations where it becomes corrupted on disk. This can +be done at the filesystem level if you are using ZFS or BTRFS. + +Since Garage v0.9.4, Garage is able to take snapshots of the metadata database +itself. This basically amounts to copying the database file, except that it can +be run live while Garage is running without the risk of corruption or +inconsistencies. This can be setup to run automatically on a schedule using +[`metadata_auto_snapshot_interval`](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval). +A snapshot can also be triggered manually using the `garage meta snapshot` +command. Note that taking a snapshot using this method is very intensive as it +requires making a full copy of the database file, so you might prefer using +filesystem-level snapshots if possible. To recover a corrupted node from such a +snapshot, read the instructions +[here](@/documentation/operations/recovering.md#corrupted_meta). + ## Metadata table resync Garage automatically resyncs all entries stored in the metadata tables every hour, diff --git a/doc/book/operations/recovering.md b/doc/book/operations/recovering.md index 7a830788..6e19db0e 100644 --- a/doc/book/operations/recovering.md +++ b/doc/book/operations/recovering.md @@ -108,3 +108,57 @@ garage layout apply # once satisfied, apply the changes Garage will then start synchronizing all required data on the new node. This process can be monitored using the `garage stats -a` command. + +## Replacement scenario 3: corrupted metadata {#corrupted_meta} + +In some cases, your metadata DB file might become corrupted, for instance if +your node suffered a power outage and did not shut down properly. In this case, +you can recover without having to change the node ID and rebuilding a cluster +layout. This means that data blocks will not need to be shuffled around, you +must simply find a way to repair the metadata file. The best way is generally +to discard the corrupted file and recover it from another source. + +First of all, start by locating the database file in your metadata directory, +which [depends on your `db_engine` +choice](@/documentation/reference-manual/configuration.md#db_engine). Then, +your recovery options are as follows: + +- **Option 1: resyncing from other nodes.** In case your cluster is replicated + with two or three copies, you can simply delete the database file, and Garage + will resync from other nodes. To do so, stop Garage, delete the database file + or directory, and restart Garage. Then, do a full table repair by calling + `garage repair -a --yes tables`. This will take a bit of time to complete as + the new node will need to receive copies of the metadata tables from the + network. + +- **Option 2: restoring a snapshot taken by Garage.** Since v0.9.4, Garage can + [automatically take regular + snapshots](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval) + of your metadata DB file. This file or directory should be located under + `/snapshots`, and is named according to the UTC time at which it + was taken. Stop Garage, discard the database file/directory and replace it by the + snapshot you want to use. For instance, in the case of LMDB: + + ```bash + cd $METADATA_DIR + mv db.lmdb db.lmdb.bak + cp -r snapshots/2024-03-15T12:13:52Z db.lmdb + ``` + + And for Sqlite: + + ```bash + cd $METADATA_DIR + mv db.sqlite db.sqlite.bak + cp snapshots/2024-03-15T12:13:52Z db.sqlite + ``` + + Then, restart Garage and run a full table repair by calling `garage repair -a + --yes tables`. This should run relatively fast as only the changes that + occurred since the snapshot was taken will need to be resynchronized. Of + course, if your cluster is not replicated, you will lose all changes that + occurred since the snapshot was taken. + +- **Option 3: restoring a filesystem-level snapshot.** If you are using ZFS or + BTRFS to snapshot your metadata partition, refer to their specific + documentation on rolling back or copying files from an old snapshot. diff --git a/doc/book/operations/upgrading.md b/doc/book/operations/upgrading.md index 6b6ea26d..c239bfe4 100644 --- a/doc/book/operations/upgrading.md +++ b/doc/book/operations/upgrading.md @@ -73,6 +73,18 @@ The entire procedure would look something like this: You can do all of the nodes in a single zone at once as that won't impact global cluster availability. Do not try to make a backup of the metadata folder of a running node. + **Since Garage v0.9.4,** you can use the `garage meta snapshot --all` command + to take a simultaneous snapshot of the metadata database files of all your + nodes. This avoids the tedious process of having to take them down one by + one before upgrading. Be careful that if automatic snapshotting is enabled, + Garage only keeps the last two snapshots and deletes older ones, so you might + want to disable automatic snapshotting in your upgraded configuration file + until you have confirmed that the upgrade ran successfully. In addition to + snapshotting the metadata databases of your nodes, you should back-up at + least the `cluster_layout` file of one of your Garage instances (this file + should be the same on all nodes and you can copy it safely while Garage is + running). + 3. Prepare your binaries and configuration files for the new Garage version 4. Restart all nodes simultaneously in the new version diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md index 8e87b7d8..de800ec0 100644 --- a/doc/book/reference-manual/configuration.md +++ b/doc/book/reference-manual/configuration.md @@ -15,6 +15,7 @@ data_dir = "/var/lib/garage/data" metadata_fsync = true data_fsync = false disable_scrub = false +metadata_auto_snapshot_interval = "6h" db_engine = "lmdb" @@ -90,6 +91,7 @@ Top-level configuration options: [`db_engine`](#db_engine), [`disable_scrub`](#disable_scrub), [`lmdb_map_size`](#lmdb_map_size), +[`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval), [`metadata_dir`](#metadata_dir), [`metadata_fsync`](#metadata_fsync), [`replication_mode`](#replication_mode), @@ -346,6 +348,25 @@ at the cost of a moderate drop in write performance. Similarly to `metatada_fsync`, this is likely not necessary if geographical replication is used. +#### `metadata_auto_snapshot_interval` (since Garage v0.9.4) {#metadata_auto_snapshot_interval} + +If this value is set, Garage will automatically take a snapshot of the metadata +DB file at a regular interval and save it in the metadata directory. +This can allow to recover from situations where the metadata DB file is corrupted, +for instance after an unclean shutdown. +See [this page](@/documentation/operations/recovering.md#corrupted_meta) for details. + +Garage keeps only the two most recent snapshots of the metadata DB and deletes +older ones automatically. + +Note that taking a metadata snapshot is a relatively intensive operation as the +entire data file is copied. A snapshot being taken might have performance +impacts on the Garage node while it is running. If the cluster is under heavy +write load when a snapshot operation is running, this might also cause the +database file to grow in size significantly as pages cannot be recycled easily. +For this reason, it might be better to use filesystem-level snapshots instead +if possible. + #### `disable_scrub` {#disable_scrub} By default, Garage runs a scrub of the data directory approximately once per