From e7e164a280dfc1c4adf9d6da6f3b2a9674eca4bd Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 9 Jun 2023 16:23:21 +0200 Subject: [PATCH] Make fsync an option for meta and data --- doc/book/reference-manual/configuration.md | 45 ++++++++++++++++++++++ src/block/manager.rs | 35 ++++++++++------- src/model/garage.rs | 15 +++++++- src/util/config.rs | 7 ++++ 4 files changed, 88 insertions(+), 14 deletions(-) diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md index 38062bab..de253393 100644 --- a/doc/book/reference-manual/configuration.md +++ b/doc/book/reference-manual/configuration.md @@ -10,6 +10,8 @@ Here is an example `garage.toml` configuration file that illustrates all of the ```toml metadata_dir = "/var/lib/garage/meta" data_dir = "/var/lib/garage/data" +metadata_fsync = true +data_fsync = false db_engine = "lmdb" @@ -124,6 +126,49 @@ convert-db -a -i \ Make sure to specify the full database path as presented in the table above, and not just the path to the metadata directory. +### `metadata_fsync` + +Whether to enable synchronous mode for the database engine or not. +This is disabled (`false`) by default. + +This reduces the risk of metadata corruption in case of power failures, +at the cost of a significant drop in write performance, +as Garage will have to pause to sync data to disk much more often +(several times for API calls such as PutObject). + +Using this option reduces the risk of simultaneous metadata corruption on several +cluster nodes, which could lead to data loss. + +If multi-site replication is used, this option is most likely not necessary, as +it is extremely unlikely that two nodes in different locations will have a +power failure at the exact same time. + +(Metadata corruption on a single node is not an issue, the corrupted data file +can always be deleted and reconstructed from the other nodes in the cluster.) 
+ +Here is how this option impacts the different database engines: + +| Database | `metadata_fsync = false` (default) | `metadata_fsync = true` | +|----------|------------------------------------|-------------------------------| +| Sled | default options | *unsupported* | +| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` | +| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` | + +Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`). + +### `data_fsync` + +Whether to `fsync` data blocks and their containing directory after they are +saved to disk. +This is disabled (`false`) by default. + +This might reduce the risk that a data block is lost in rare +situations such as several nodes simultaneously losing power, +at the cost of a moderate drop in write performance. + +Similarly to `metadata_fsync`, this is likely not necessary +if geographical replication is used. + ### `block_size` Garage splits stored objects in consecutive chunks of size `block_size` diff --git a/src/block/manager.rs b/src/block/manager.rs index 3ece9a8a..c7e4cd03 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -80,6 +80,7 @@ pub struct BlockManager { /// Directory in which block are stored pub data_dir: PathBuf, + data_fsync: bool, compression_level: Option, mutation_lock: [Mutex; 256], @@ -114,6 +115,7 @@ impl BlockManager { pub fn new( db: &db::Db, data_dir: PathBuf, + data_fsync: bool, compression_level: Option, replication: TableShardedReplication, system: Arc, @@ -141,6 +143,7 @@ impl BlockManager { let block_manager = Arc::new(Self { replication, data_dir, + data_fsync, compression_level, mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())), rc, @@ -713,7 +716,11 @@ impl BlockManagerLocked { let mut f = fs::File::create(&path_tmp).await?; f.write_all(data).await?; - f.sync_all().await?; + + if mgr.data_fsync { + f.sync_all().await?; + } + drop(f); fs::rename(path_tmp, path).await?; @@ -724,18 +731,20 @@ impl 
BlockManagerLocked { fs::remove_file(to_delete).await?; } - // We want to ensure that when this function returns, data is properly persisted - // to disk. The first step is the sync_all above that does an fsync on the data file. - // Now, we do an fsync on the containing directory, to ensure that the rename - // is persisted properly. See: - // http://thedjbway.b0llix.net/qmail/syncdir.html - let dir = fs::OpenOptions::new() - .read(true) - .mode(0) - .open(directory) - .await?; - dir.sync_all().await?; - drop(dir); + if mgr.data_fsync { + // We want to ensure that when this function returns, data is properly persisted + // to disk. The first step is the sync_all above that does an fsync on the data file. + // Now, we do an fsync on the containing directory, to ensure that the rename + // is persisted properly. See: + // http://thedjbway.b0llix.net/qmail/syncdir.html + let dir = fs::OpenOptions::new() + .read(true) + .mode(0) + .open(directory) + .await?; + dir.sync_all().await?; + drop(dir); + } Ok(()) } diff --git a/src/model/garage.rs b/src/model/garage.rs index 0fbcf334..9b7121db 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -91,6 +91,11 @@ impl Garage { // ---- Sled DB ---- #[cfg(feature = "sled")] "sled" => { + if config.metadata_fsync { + return Err(Error::Message(format!( + "`metadata_fsync = true` is not supported with the Sled database engine" + ))); + } db_path.push("db"); info!("Opening Sled database at: {}", db_path.display()); let db = db::sled_adapter::sled::Config::default() @@ -111,7 +116,11 @@ impl Garage { let db = db::sqlite_adapter::rusqlite::Connection::open(db_path) .and_then(|db| { db.pragma_update(None, "journal_mode", &"WAL")?; - db.pragma_update(None, "synchronous", &"NORMAL")?; + if config.metadata_fsync { + db.pragma_update(None, "synchronous", &"NORMAL")?; + } else { + db.pragma_update(None, "synchronous", &"OFF")?; + } Ok(db) }) .ok_or_message("Unable to open sqlite DB")?; @@ -139,6 +148,9 @@ impl Garage { 
env_builder.map_size(map_size); unsafe { env_builder.flag(heed::flags::Flags::MdbNoMetaSync); + if !config.metadata_fsync { + env_builder.flag(heed::flags::Flags::MdbNoSync); + } } let db = match env_builder.open(&db_path) { Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { @@ -208,6 +220,7 @@ impl Garage { let block_manager = BlockManager::new( &db, config.data_dir.clone(), + config.data_fsync, config.compression_level, data_rep_param, system.clone(), diff --git a/src/util/config.rs b/src/util/config.rs index 77952356..009f0574 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -15,6 +15,13 @@ pub struct Config { /// Path where to store data. Can be slower, but need higher volume pub data_dir: PathBuf, + /// Whether to fsync after all metadata transactions (disabled by default) + #[serde(default)] + pub metadata_fsync: bool, + /// Whether to fsync after all data block writes (disabled by default) + #[serde(default)] + pub data_fsync: bool, + /// Size of data blocks to save to disk #[serde(default = "default_block_size")] pub block_size: usize,