forked from Deuxfleurs/garage

Make fsync an option for meta and data

commit e7e164a280 (parent 1e466b11eb)

4 changed files with 88 additions and 14 deletions

@@ -10,6 +10,8 @@ Here is an example `garage.toml` configuration file that illustrates all of the
 ```toml
 metadata_dir = "/var/lib/garage/meta"
 data_dir = "/var/lib/garage/data"
+metadata_fsync = true
+data_fsync = false
 
 db_engine = "lmdb"
 

@@ -124,6 +126,49 @@ convert-db -a <input db engine> -i <input db path> \
 Make sure to specify the full database path as presented in the table above,
 and not just the path to the metadata directory.
 
+### `metadata_fsync`
+
+Whether to enable synchronous mode for the database engine or not.
+This is disabled (`false`) by default.
+
+Enabling it reduces the risk of metadata corruption in case of power failures,
+at the cost of a significant drop in write performance,
+as Garage will have to pause to sync data to disk much more often
+(several times for API calls such as PutObject).
+
+Using this option reduces the risk of simultaneous metadata corruption on several
+cluster nodes, which could lead to data loss.
+
+If multi-site replication is used, this option is most likely not necessary, as
+it is extremely unlikely that two nodes in different locations will have a
+power failure at the exact same time.
+
+(Metadata corruption on a single node is not an issue: the corrupted data file
+can always be deleted and reconstructed from the other nodes in the cluster.)
+
+Here is how this option impacts the different database engines:
+
+| Database | `metadata_fsync = false` (default) | `metadata_fsync = true`       |
+|----------|------------------------------------|-------------------------------|
+| Sled     | default options                    | *unsupported*                 |
+| Sqlite   | `PRAGMA synchronous = OFF`         | `PRAGMA synchronous = NORMAL` |
+| LMDB     | `MDB_NOMETASYNC` + `MDB_NOSYNC`    | `MDB_NOMETASYNC`              |
+
+Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`).
+
+### `data_fsync`
+
+Whether to `fsync` data blocks and their containing directory after they are
+saved to disk.
+This is disabled (`false`) by default.
+
+This might reduce the risk that a data block is lost in rare
+situations such as several nodes simultaneously losing power,
+at the cost of a moderate drop in write performance.
+
+Similarly to `metadata_fsync`, this is likely not necessary
+if geographical replication is used.
+
 ### `block_size`
 
 Garage splits stored objects in consecutive chunks of size `block_size`

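For reference, here is what the documented example looks like once the commit is applied, with both new options set explicitly (values are the ones from the example above; with `db_engine = "sled"`, `metadata_fsync = true` is instead rejected at startup, as the engine-selection hunk further down shows):

```toml
metadata_dir = "/var/lib/garage/meta"
data_dir = "/var/lib/garage/data"
metadata_fsync = true
data_fsync = false

db_engine = "lmdb"
```
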
@@ -80,6 +80,7 @@ pub struct BlockManager {
 	/// Directory in which block are stored
 	pub data_dir: PathBuf,
 
+	data_fsync: bool,
 	compression_level: Option<i32>,
 
 	mutation_lock: [Mutex<BlockManagerLocked>; 256],

@@ -114,6 +115,7 @@ impl BlockManager {
 	pub fn new(
 		db: &db::Db,
 		data_dir: PathBuf,
+		data_fsync: bool,
 		compression_level: Option<i32>,
 		replication: TableShardedReplication,
 		system: Arc<System>,

@@ -141,6 +143,7 @@ impl BlockManager {
 		let block_manager = Arc::new(Self {
 			replication,
 			data_dir,
+			data_fsync,
 			compression_level,
 			mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())),
 			rc,

@@ -713,7 +716,11 @@ impl BlockManagerLocked {
 
 		let mut f = fs::File::create(&path_tmp).await?;
 		f.write_all(data).await?;
-		f.sync_all().await?;
+		if mgr.data_fsync {
+			f.sync_all().await?;
+		}
 
 		drop(f);
 
 		fs::rename(path_tmp, path).await?;

@@ -724,18 +731,20 @@ impl BlockManagerLocked {
 			fs::remove_file(to_delete).await?;
 		}
 
-		// We want to ensure that when this function returns, data is properly persisted
-		// to disk. The first step is the sync_all above that does an fsync on the data file.
-		// Now, we do an fsync on the containing directory, to ensure that the rename
-		// is persisted properly. See:
-		// http://thedjbway.b0llix.net/qmail/syncdir.html
-		let dir = fs::OpenOptions::new()
-			.read(true)
-			.mode(0)
-			.open(directory)
-			.await?;
-		dir.sync_all().await?;
-		drop(dir);
+		if mgr.data_fsync {
+			// We want to ensure that when this function returns, data is properly persisted
+			// to disk. The first step is the sync_all above that does an fsync on the data file.
+			// Now, we do an fsync on the containing directory, to ensure that the rename
+			// is persisted properly. See:
+			// http://thedjbway.b0llix.net/qmail/syncdir.html
+			let dir = fs::OpenOptions::new()
+				.read(true)
+				.mode(0)
+				.open(directory)
+				.await?;
+			dir.sync_all().await?;
+			drop(dir);
+		}
 
 		Ok(())
 	}

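For readers unfamiliar with the directory-fsync trick in the hunk above, here is a standalone sketch of the full pattern that `data_fsync = true` enables: write to a temporary file, fsync it, rename it into place, then fsync the containing directory so the rename itself is durable. Garage does this with `tokio::fs`; the sketch uses blocking `std::fs`, and the function name `write_block_durably` is hypothetical.

```rust
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

// Sketch of the durable-write pattern (Unix-specific: a directory can be
// opened read-only and fsync'ed, cf.
// http://thedjbway.b0llix.net/qmail/syncdir.html).
fn write_block_durably(dir: &Path, name: &str, data: &[u8]) -> std::io::Result<()> {
    let path_tmp = dir.join(format!("{}.tmp", name));
    let path = dir.join(name);

    let mut f = File::create(&path_tmp)?;
    f.write_all(data)?;
    f.sync_all()?; // fsync the file contents before the rename
    drop(f);

    fs::rename(&path_tmp, &path)?;

    // fsync the containing directory so the rename itself is persisted
    let dir_fd = File::open(dir)?;
    dir_fd.sync_all()?;
    Ok(())
}
```
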
@@ -91,6 +91,11 @@ impl Garage {
 			// ---- Sled DB ----
 			#[cfg(feature = "sled")]
 			"sled" => {
+				if config.metadata_fsync {
+					return Err(Error::Message(format!(
+						"`metadata_fsync = true` is not supported with the Sled database engine"
+					)));
+				}
 				db_path.push("db");
 				info!("Opening Sled database at: {}", db_path.display());
 				let db = db::sled_adapter::sled::Config::default()

@@ -111,7 +116,11 @@ impl Garage {
 				let db = db::sqlite_adapter::rusqlite::Connection::open(db_path)
 					.and_then(|db| {
 						db.pragma_update(None, "journal_mode", &"WAL")?;
-						db.pragma_update(None, "synchronous", &"NORMAL")?;
+						if config.metadata_fsync {
+							db.pragma_update(None, "synchronous", &"NORMAL")?;
+						} else {
+							db.pragma_update(None, "synchronous", &"OFF")?;
+						}
 						Ok(db)
 					})
 					.ok_or_message("Unable to open sqlite DB")?;

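Condensed for clarity, the SQLite branch amounts to the following: WAL journaling is unconditional, and only the `synchronous` level depends on `metadata_fsync` (`NORMAL` syncs at critical moments such as WAL checkpoints; `OFF` hands durability entirely to the OS). The helper name `open_sqlite` is hypothetical; the `pragma_update` calls mirror the hunk above.

```rust
use rusqlite::Connection;

// Hypothetical condensation of the SQLite engine-setup branch above.
fn open_sqlite(path: &str, metadata_fsync: bool) -> rusqlite::Result<Connection> {
    let db = Connection::open(path)?;
    // WAL mode is always enabled; the synchronous level follows the config.
    db.pragma_update(None, "journal_mode", &"WAL")?;
    let level = if metadata_fsync { "NORMAL" } else { "OFF" };
    db.pragma_update(None, "synchronous", &level)?;
    Ok(db)
}
```
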
@@ -139,6 +148,9 @@ impl Garage {
 				env_builder.map_size(map_size);
 				unsafe {
 					env_builder.flag(heed::flags::Flags::MdbNoMetaSync);
+					if !config.metadata_fsync {
+						env_builder.flag(heed::flags::Flags::MdbNoSync);
+					}
 				}
 				let db = match env_builder.open(&db_path) {
 					Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {

@@ -208,6 +220,7 @@ impl Garage {
 		let block_manager = BlockManager::new(
 			&db,
 			config.data_dir.clone(),
+			config.data_fsync,
 			config.compression_level,
 			data_rep_param,
 			system.clone(),

@@ -15,6 +15,13 @@ pub struct Config {
 	/// Path where to store data. Can be slower, but need higher volume
 	pub data_dir: PathBuf,
 
+	/// Whether to fsync after all metadata transactions (disabled by default)
+	#[serde(default)]
+	pub metadata_fsync: bool,
+	/// Whether to fsync after all data block writes (disabled by default)
+	#[serde(default)]
+	pub data_fsync: bool,
+
 	/// Size of data blocks to save to disk
 	#[serde(default = "default_block_size")]
 	pub block_size: usize,

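Since both new fields use `#[serde(default)]`, omitting them from `garage.toml` yields `false`. A quick illustrative test of that behavior (hypothetical, assuming the `serde` derive and `toml` crates are available):

```rust
#[cfg(test)]
mod tests {
    #[derive(serde::Deserialize)]
    struct MiniConfig {
        #[serde(default)]
        metadata_fsync: bool,
        #[serde(default)]
        data_fsync: bool,
    }

    #[test]
    fn fsync_flags_default_to_false() {
        // An empty TOML document leaves both flags at bool's default, false.
        let cfg: MiniConfig = toml::from_str("").unwrap();
        assert!(!cfg.metadata_fsync);
        assert!(!cfg.data_fsync);
    }
}
```
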