WIP: Fjall DB engine #906
3 changed files with 56 additions and 38 deletions
|
@ -5,8 +5,8 @@ use std::path::PathBuf;
|
|||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use fjall::{
|
||||
PartitionCreateOptions, PersistMode, TransactionalKeyspace,
|
||||
TransactionalPartitionHandle, WriteTransaction,
|
||||
PartitionCreateOptions, PersistMode, TransactionalKeyspace, TransactionalPartitionHandle,
|
||||
WriteTransaction,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
|
@ -183,13 +183,13 @@ impl IDb for FjallDb {
|
|||
fn iter(&self, tree_idx: usize) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?;
|
||||
let tx = self.keyspace.read_tx();
|
||||
Ok(Box::new(tx.iter(&tree).map(iterator_remap)))
|
||||
Ok(Box::new(tx.iter(&tree).map(iterator_remap)))
|
||||
}
|
||||
|
||||
fn iter_rev(&self, tree_idx: usize) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?;
|
||||
let tx = self.keyspace.read_tx();
|
||||
Ok(Box::new(tx.iter(&tree).rev().map(iterator_remap)))
|
||||
Ok(Box::new(tx.iter(&tree).rev().map(iterator_remap)))
|
||||
}
|
||||
|
||||
fn range<'r>(
|
||||
|
@ -200,7 +200,10 @@ impl IDb for FjallDb {
|
|||
) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?;
|
||||
let tx = self.keyspace.read_tx();
|
||||
Ok(Box::new(tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high)).map(iterator_remap)))
|
||||
Ok(Box::new(
|
||||
tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high))
|
||||
.map(iterator_remap),
|
||||
))
|
||||
}
|
||||
fn range_rev<'r>(
|
||||
&self,
|
||||
|
@ -210,7 +213,11 @@ impl IDb for FjallDb {
|
|||
) -> Result<ValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?;
|
||||
let tx = self.keyspace.read_tx();
|
||||
Ok(Box::new(tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high)).rev().map(iterator_remap)))
|
||||
Ok(Box::new(
|
||||
tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high))
|
||||
.rev()
|
||||
.map(iterator_remap),
|
||||
))
|
||||
}
|
||||
|
||||
// ----
|
||||
|
@ -288,11 +295,11 @@ impl<'a> ITx for FjallTx<'a> {
|
|||
|
||||
fn iter(&self, tree_idx: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?.clone();
|
||||
Ok(Box::new(self.tx.iter(&tree).map(iterator_remap_tx)))
|
||||
Ok(Box::new(self.tx.iter(&tree).map(iterator_remap_tx)))
|
||||
}
|
||||
fn iter_rev(&self, tree_idx: usize) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?.clone();
|
||||
Ok(Box::new(self.tx.iter(&tree).rev().map(iterator_remap_tx)))
|
||||
Ok(Box::new(self.tx.iter(&tree).rev().map(iterator_remap_tx)))
|
||||
}
|
||||
|
||||
fn range<'r>(
|
||||
|
@ -302,9 +309,13 @@ impl<'a> ITx for FjallTx<'a> {
|
|||
high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?;
|
||||
let low = clone_bound(low);
|
||||
let high = clone_bound(high);
|
||||
Ok(Box::new(self.tx.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high)).map(iterator_remap_tx)))
|
||||
let low = clone_bound(low);
|
||||
let high = clone_bound(high);
|
||||
Ok(Box::new(
|
||||
self.tx
|
||||
.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high))
|
||||
.map(iterator_remap_tx),
|
||||
))
|
||||
}
|
||||
fn range_rev<'r>(
|
||||
&self,
|
||||
|
@ -313,22 +324,27 @@ impl<'a> ITx for FjallTx<'a> {
|
|||
high: Bound<&'r [u8]>,
|
||||
) -> TxOpResult<TxValueIter<'_>> {
|
||||
let tree = self.get_tree(tree_idx)?;
|
||||
let low = clone_bound(low);
|
||||
let high = clone_bound(high);
|
||||
Ok(Box::new(self.tx.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high)).rev().map(iterator_remap_tx)))
|
||||
let low = clone_bound(low);
|
||||
let high = clone_bound(high);
|
||||
Ok(Box::new(
|
||||
self.tx
|
||||
.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high))
|
||||
.rev()
|
||||
.map(iterator_remap_tx),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
// -- maps fjall's (k, v) to ours
|
||||
|
||||
fn iterator_remap(r: fjall::Result<(fjall::Slice, fjall::Slice)>) -> Result<(Value, Value)> {
|
||||
r.map(|(k, v)| (k.to_vec(), v.to_vec()))
|
||||
.map_err(|e| e.into())
|
||||
r.map(|(k, v)| (k.to_vec(), v.to_vec()))
|
||||
.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
fn iterator_remap_tx(r: fjall::Result<(fjall::Slice, fjall::Slice)>) -> TxOpResult<(Value, Value)> {
|
||||
r.map(|(k, v)| (k.to_vec(), v.to_vec()))
|
||||
.map_err(|e| e.into())
|
||||
r.map(|(k, v)| (k.to_vec(), v.to_vec()))
|
||||
.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
// -- utils to deal with Garage's tightness on Bound lifetimes
|
||||
|
@ -337,14 +353,14 @@ type ByteVecBound = Bound<Vec<u8>>;
|
|||
type ByteVecRangeBounds = (ByteVecBound, ByteVecBound);
|
||||
|
||||
fn clone_bound(bound: Bound<&[u8]>) -> ByteVecBound {
|
||||
let value = match bound {
|
||||
Bound::Excluded(v) | Bound::Included(v) => v.to_vec(),
|
||||
Bound::Unbounded => vec!(),
|
||||
};
|
||||
let value = match bound {
|
||||
Bound::Excluded(v) | Bound::Included(v) => v.to_vec(),
|
||||
Bound::Unbounded => vec![],
|
||||
};
|
||||
|
||||
match bound {
|
||||
Bound::Included(_) => Bound::Included(value),
|
||||
Bound::Excluded(_) => Bound::Excluded(value),
|
||||
Bound::Unbounded => Bound::Unbounded,
|
||||
}
|
||||
match bound {
|
||||
Bound::Included(_) => Bound::Included(value),
|
||||
Bound::Excluded(_) => Bound::Excluded(value),
|
||||
Bound::Unbounded => Bound::Unbounded,
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
use std::convert::TryInto;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use crate::{Db, Error, Result};
|
||||
|
||||
|
@ -56,7 +56,7 @@ impl std::str::FromStr for Engine {
|
|||
pub struct OpenOpt {
|
||||
pub fsync: bool,
|
||||
pub lmdb_map_size: Option<usize>,
|
||||
pub fjall_block_cache_size: Option<usize>,
|
||||
pub fjall_block_cache_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl Default for OpenOpt {
|
||||
|
@ -64,7 +64,7 @@ impl Default for OpenOpt {
|
|||
Self {
|
||||
fsync: false,
|
||||
lmdb_map_size: None,
|
||||
fjall_block_cache_size: None,
|
||||
fjall_block_cache_size: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -125,12 +125,14 @@ pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result<Db> {
|
|||
#[cfg(feature = "fjall")]
|
||||
Engine::Fjall => {
|
||||
info!("Opening Fjall database at: {}", path.display());
|
||||
let fsync_ms = opt.fsync.then(|| 1000 as u16);
|
||||
let mut config = fjall::Config::new(path).fsync_ms(fsync_ms);
|
||||
if let Some(block_cache_size) = opt.fjall_block_cache_size {
|
||||
let block_cache = Arc::new(fjall::BlockCache::with_capacity_bytes(block_cache_size.try_into().unwrap()));
|
||||
config = config.block_cache(block_cache);
|
||||
}
|
||||
let fsync_ms = opt.fsync.then(|| 1000 as u16);
|
||||
|
||||
let mut config = fjall::Config::new(path).fsync_ms(fsync_ms);
|
||||
if let Some(block_cache_size) = opt.fjall_block_cache_size {
|
||||
let block_cache = Arc::new(fjall::BlockCache::with_capacity_bytes(
|
||||
block_cache_size.try_into().unwrap(),
|
||||
));
|
||||
config = config.block_cache(block_cache);
|
||||
}
|
||||
let keyspace = config.open_transactional()?;
|
||||
Ok(crate::fjall_adapter::FjallDb::init(path, keyspace))
|
||||
}
|
||||
|
|
|
@ -115,8 +115,8 @@ pub struct Config {
|
|||
#[serde(deserialize_with = "deserialize_capacity", default)]
|
||||
pub lmdb_map_size: usize,
|
||||
|
||||
/// Fjall block cache size
|
||||
#[serde(deserialize_with = "deserialize_capacity", default)]
|
||||
/// Fjall block cache size
|
||||
#[serde(deserialize_with = "deserialize_capacity", default)]
|
||||
pub fjall_block_cache_size: usize,
|
||||
|
||||
// -- APIs
|
||||
|
|
Loading…
Reference in a new issue
I think the correct implementation of `opt.fsync == false` would be to disable all fsync operations in fjall, in particular setting `manual_journal_persist` to `true` so that transactions would not do an fsync call. This is the meaning of that option for other db engines. Even with `opt.fsync == false` we can set `fsync_ms` to something reasonable like 1000, because if I understand correctly, the fsyncs will now be done by background threads at a regular interval and will not interfere with interactive operations. @marvinj97 please correct me if I'm wrong.

By default, every write operation (such as a `WriteTx.commit`) flushes to OS buffers, but not to disk. This is the same behaviour as RocksDB, and gives you crash safety, but not power loss/kernel panic safety.
`manual_journal_persist` skips flushing to OS buffers, so all the data is kept in the user-space BufWriter (unless it is full or you call `Keyspace::persist` or set a `WriteTransaction::durability` level), so you can lose data if the application cannot unwind properly (e.g. it is killed).