Make fsync an option for meta and data
continuous-integration/drone/pr Build is failing Details
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Alex 2023-06-09 16:23:21 +02:00
parent 1e466b11eb
commit e7e164a280
4 changed files with 88 additions and 14 deletions

View File

@ -10,6 +10,8 @@ Here is an example `garage.toml` configuration file that illustrates all of the
```toml
metadata_dir = "/var/lib/garage/meta"
data_dir = "/var/lib/garage/data"
metadata_fsync = true
data_fsync = false
db_engine = "lmdb"
@ -124,6 +126,49 @@ convert-db -a <input db engine> -i <input db path> \
Make sure to specify the full database path as presented in the table above,
and not just the path to the metadata directory.
### `metadata_fsync`
Whether to enable synchronous mode for the database engine or not.
This is disabled (`false`) by default.
This reduces the risk of metadata corruption in case of power failures,
at the cost of a significant drop in write performance,
as Garage will have to pause to sync data to disk much more often
(several times for API calls such as PutObject).
Using this option reduces the risk of simultaneous metadata corruption on several
cluster nodes, which could lead to data loss.
If multi-site replication is used, this option is most likely not necessary, as
it is extremely unlikely that two nodes in different locations will have a
power failure at the exact same time.
(Metadata corruption on a single node is not an issue, the corrupted data file
can always be deleted and reconstructed from the other nodes in the cluster.)
Here is how this option impacts the different database engines:
| Database | `metadata_fsync = false` (default) | `metadata_fsync = true` |
|----------|------------------------------------|-------------------------------|
| Sled | default options | *unsupported* |
| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` |
| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` |
Note that the Sqlite database is always ran in `WAL` mode (`PRAGMA journal_mode = WAL`).
### `data_fsync`
Whether to `fsync` data blocks and their containing directory after they are
saved to disk.
This is disabled (`false`) by default.
This might reduce the risk that a data block is lost in rare
situations such as simultaneous node losing power,
at the cost of a moderate drop in write performance.
Similarly to `metatada_fsync`, this is likely not necessary
if geographical replication is used.
### `block_size`
Garage splits stored objects in consecutive chunks of size `block_size`

View File

@ -80,6 +80,7 @@ pub struct BlockManager {
/// Directory in which block are stored
pub data_dir: PathBuf,
data_fsync: bool,
compression_level: Option<i32>,
mutation_lock: [Mutex<BlockManagerLocked>; 256],
@ -114,6 +115,7 @@ impl BlockManager {
pub fn new(
db: &db::Db,
data_dir: PathBuf,
data_fsync: bool,
compression_level: Option<i32>,
replication: TableShardedReplication,
system: Arc<System>,
@ -141,6 +143,7 @@ impl BlockManager {
let block_manager = Arc::new(Self {
replication,
data_dir,
data_fsync,
compression_level,
mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())),
rc,
@ -713,7 +716,11 @@ impl BlockManagerLocked {
let mut f = fs::File::create(&path_tmp).await?;
f.write_all(data).await?;
f.sync_all().await?;
if mgr.data_fsync {
f.sync_all().await?;
}
drop(f);
fs::rename(path_tmp, path).await?;
@ -724,18 +731,20 @@ impl BlockManagerLocked {
fs::remove_file(to_delete).await?;
}
// We want to ensure that when this function returns, data is properly persisted
// to disk. The first step is the sync_all above that does an fsync on the data file.
// Now, we do an fsync on the containing directory, to ensure that the rename
// is persisted properly. See:
// http://thedjbway.b0llix.net/qmail/syncdir.html
let dir = fs::OpenOptions::new()
.read(true)
.mode(0)
.open(directory)
.await?;
dir.sync_all().await?;
drop(dir);
if mgr.data_fsync {
// We want to ensure that when this function returns, data is properly persisted
// to disk. The first step is the sync_all above that does an fsync on the data file.
// Now, we do an fsync on the containing directory, to ensure that the rename
// is persisted properly. See:
// http://thedjbway.b0llix.net/qmail/syncdir.html
let dir = fs::OpenOptions::new()
.read(true)
.mode(0)
.open(directory)
.await?;
dir.sync_all().await?;
drop(dir);
}
Ok(())
}

View File

@ -91,6 +91,11 @@ impl Garage {
// ---- Sled DB ----
#[cfg(feature = "sled")]
"sled" => {
if config.metadata_fsync {
return Err(Error::Message(format!(
"`metadata_fsync = true` is not supported with the Sled database engine"
)));
}
db_path.push("db");
info!("Opening Sled database at: {}", db_path.display());
let db = db::sled_adapter::sled::Config::default()
@ -111,7 +116,11 @@ impl Garage {
let db = db::sqlite_adapter::rusqlite::Connection::open(db_path)
.and_then(|db| {
db.pragma_update(None, "journal_mode", &"WAL")?;
db.pragma_update(None, "synchronous", &"NORMAL")?;
if config.metadata_fsync {
db.pragma_update(None, "synchronous", &"NORMAL")?;
} else {
db.pragma_update(None, "synchronous", &"OFF")?;
}
Ok(db)
})
.ok_or_message("Unable to open sqlite DB")?;
@ -139,6 +148,9 @@ impl Garage {
env_builder.map_size(map_size);
unsafe {
env_builder.flag(heed::flags::Flags::MdbNoMetaSync);
if !config.metadata_fsync {
env_builder.flag(heed::flags::Flags::MdbNoSync);
}
}
let db = match env_builder.open(&db_path) {
Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
@ -208,6 +220,7 @@ impl Garage {
let block_manager = BlockManager::new(
&db,
config.data_dir.clone(),
config.data_fsync,
config.compression_level,
data_rep_param,
system.clone(),

View File

@ -15,6 +15,13 @@ pub struct Config {
/// Path where to store data. Can be slower, but need higher volume
pub data_dir: PathBuf,
/// Whether to fsync after all metadata transactions (disabled by default)
#[serde(default)]
pub metadata_fsync: bool,
/// Whether to fsync after all data block writes (disabled by default)
#[serde(default)]
pub data_fsync: bool,
/// Size of data blocks to save to disk
#[serde(default = "default_block_size")]
pub block_size: usize,