WIP: Fjall DB engine #906

Draft
withings wants to merge 5 commits from withings/garage:feat/fjall-db-engine into main
3 changed files with 56 additions and 38 deletions
Showing only changes of commit 04d3847200 - Show all commits

View file

@ -5,8 +5,8 @@ use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use fjall::{
PartitionCreateOptions, PersistMode, TransactionalKeyspace,
TransactionalPartitionHandle, WriteTransaction,
PartitionCreateOptions, PersistMode, TransactionalKeyspace, TransactionalPartitionHandle,
WriteTransaction,
};
use crate::{
@ -200,7 +200,10 @@ impl IDb for FjallDb {
) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
Ok(Box::new(tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high)).map(iterator_remap)))
Ok(Box::new(
tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high))
.map(iterator_remap),
))
}
fn range_rev<'r>(
&self,
@ -210,7 +213,11 @@ impl IDb for FjallDb {
) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
Ok(Box::new(tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high)).rev().map(iterator_remap)))
Ok(Box::new(
tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high))
.rev()
.map(iterator_remap),
))
}
// ----
@ -304,7 +311,11 @@ impl<'a> ITx for FjallTx<'a> {
let tree = self.get_tree(tree_idx)?;
let low = clone_bound(low);
let high = clone_bound(high);
Ok(Box::new(self.tx.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high)).map(iterator_remap_tx)))
Ok(Box::new(
self.tx
.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high))
.map(iterator_remap_tx),
))
}
fn range_rev<'r>(
&self,
@ -315,7 +326,12 @@ impl<'a> ITx for FjallTx<'a> {
let tree = self.get_tree(tree_idx)?;
let low = clone_bound(low);
let high = clone_bound(high);
Ok(Box::new(self.tx.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high)).rev().map(iterator_remap_tx)))
Ok(Box::new(
self.tx
.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high))
.rev()
.map(iterator_remap_tx),
))
}
}
@ -339,7 +355,7 @@ type ByteVecRangeBounds = (ByteVecBound, ByteVecBound);
fn clone_bound(bound: Bound<&[u8]>) -> ByteVecBound {
let value = match bound {
Bound::Excluded(v) | Bound::Included(v) => v.to_vec(),
Bound::Unbounded => vec!(),
Bound::Unbounded => vec![],
};
match bound {

View file

@ -1,6 +1,6 @@
use std::convert::TryInto;
use std::path::PathBuf;
use std::sync::Arc;
use std::convert::TryInto;
use crate::{Db, Error, Result};
@ -128,7 +128,9 @@ pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result<Db> {
let fsync_ms = opt.fsync.then(|| 1000 as u16);
Outdated
Review

I think the correct implementation of opt.fsync == false would be to disable all fsync operations in fjall, in particular setting manual_journal_persist to true so that transactions would not do an fsync call. This is the meaning of that option for other DB engines. Even with opt.fsync == false we can set fsync_ms to something reasonable like 1000, because, if I understand correctly, the fsyncs will now be done by background threads at a regular interval and will not interfere with interactive operations. @marvinj97 please correct me if I'm wrong.

I think the correct implementation of `opt.fsync == false` would be to disable all fsync operations in fjall, in particular setting `manual_journal_persist` to `true` so that transactions would not do an fsync call. This is the meaning of that option for other DB engines. Even with `opt.fsync == false` we can set `fsync_ms` to something reasonable like 1000, because, if I understand correctly, the fsyncs will now be done by background threads at a regular interval and will not interfere with interactive operations. @marvinj97 please correct me if I'm wrong.

By default, every write operation (such as a WriteTx.commit) flushes to OS buffers, but not to disk. This is the same behaviour as RocksDB, and gives you crash safety, but not power loss/kernel panic safety.
manual_journal_persist skips flushing to OS buffers, so all the data is kept in the user-space BufWriter (unless it is full or you call Keyspace::persist or set a WriteTransaction::durability level), so you can lose data if the application cannot unwind properly (e.g. it is killed).

By default, every write operation (such as a WriteTx.commit) flushes to _OS buffers_, but **not** to disk. This is the same behaviour as RocksDB, and gives you crash safety, but not power loss/kernel panic safety. `manual_journal_persist` skips flushing to OS buffers, so all the data is kept in the user-space BufWriter (unless it is full or you call `Keyspace::persist` or set a `WriteTransaction::durability` level), so you can lose data if the application cannot unwind properly (e.g. it is killed).
let mut config = fjall::Config::new(path).fsync_ms(fsync_ms);
if let Some(block_cache_size) = opt.fjall_block_cache_size {
let block_cache = Arc::new(fjall::BlockCache::with_capacity_bytes(block_cache_size.try_into().unwrap()));
let block_cache = Arc::new(fjall::BlockCache::with_capacity_bytes(
block_cache_size.try_into().unwrap(),
));
config = config.block_cache(block_cache);
}
let keyspace = config.open_transactional()?;