- [x] Design interface
- [x] Implement Sled backend
- [x] Re-implement the SledCountedTree hack ~~on Sled backend~~ on all backends (i.e. over the abstraction)
- [x] Convert Garage code to use generic interface
- [x] Proof-read converted Garage code
- [ ] Test everything well
- [x] Implement sqlite backend
- [x] Implement LMDB backend
- [ ] (Implement Persy backend?)
- [ ] (Implement other backends? (like RocksDB, ...))
- [x] Implement backend choice in config file and garage server module
- [x] Add CLI for converting between DB formats
- Exploit the new interface to put more things in transactions
  - [x] `.updated()` trigger on Garage tables

Fix #284

**Bugs**

- [x] When exporting sqlite, trees iterate empty??
- [x] LMDB doesn't work

**Known issues for various back-ends**

- Sled:
  - Eats all my RAM and also all my disk space
  - `.len()` has to traverse the whole table
  - Is actually quite slow on some operations
  - And is actually pretty bad code...
- Sqlite:
  - Requires a lock to be taken on all operations. The lock is also taken when iterating on a table with `.iter()`, and it isn't released until the iterator is dropped. This means we must be VERY careful not to do anything else inside an `.iter()` loop, or we will have a deadlock! Most such cases have been eliminated from the Garage codebase, but some might remain. If your Garage-over-Sqlite seems to hang or freeze, this is the reason (a short sketch of the pattern to avoid follows this description).
  - (adapter uses a bunch of unsafe code)
- Heed (LMDB):
  - Not suited for 32-bit machines as it has to map the whole DB in memory.
  - (adapter uses a tiny bit of unsafe code)

**My recommendation:** avoid 32-bit machines and use LMDB as much as possible.

**Converting databases** is actually quite easy. For example, from Sled to LMDB:

```bash
cd src/db
cargo run --features cli --bin convert -- -i path/to/garage/meta/db -a sled -o path/to/garage/meta/db.lmdb -b lmdb
```

Then, just add this to your `config.toml`:

```toml
db_engine = "lmdb"
```

Co-authored-by: Alex Auvolat <alex@adnab.me>
Reviewed-on: #322
Co-authored-by: Alex <alex@adnab.me>
Co-committed-by: Alex <alex@adnab.me>
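To make the SQLite locking caveat above concrete, here is a minimal sketch of the pattern to avoid and the usual workaround. The `purge_empty_values` helper and the `example_tree` name are invented for the illustration and are not part of the Garage codebase; only calls from the `garage_db` API shown in the diff below are used.

```rust
use garage_db::{Db, Result};

// Hypothetical helper, only to illustrate the sqlite backend's locking
// behaviour: the lock taken by `.iter()` is held until the iterator is dropped.
fn purge_empty_values(db: &Db) -> Result<()> {
	let tree = db.open_tree("example_tree")?;

	// DON'T: mutating the tree while the iterator is still alive can
	// deadlock on the sqlite backend, because the iterator still holds
	// the connection lock.
	//
	// for item in tree.iter()? {
	//     let (k, v) = item?;
	//     if v.is_empty() {
	//         tree.remove(&k)?; // may deadlock here
	//     }
	// }

	// DO: finish (and drop) the iterator first, then mutate.
	let mut to_remove = Vec::new();
	for item in tree.iter()? {
		let (k, v) = item?;
		if v.is_empty() {
			to_remove.push(k);
		}
	} // iterator dropped here, lock released
	for k in to_remove {
		tree.remove(&k)?;
	}
	Ok(())
}
```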
42 changed files with 3090 additions and 645 deletions
**`src/db/Cargo.toml`** (new file, 36 lines)

```toml
[package]
name = "garage_db"
version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "Abstraction over multiple key/value storage engines that supports transactions"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
readme = "../../README.md"

[lib]
path = "lib.rs"

[[bin]]
name = "convert"
path = "bin/convert.rs"
required-features = ["cli"]

[dependencies]
err-derive = "0.3"
hexdump = "0.1"
log = "0.4"

heed = "0.11"
rusqlite = { version = "0.27", features = ["bundled"] }
sled = "0.34"

# cli deps
clap = { version = "3.1.18", optional = true, features = ["derive", "env"] }
pretty_env_logger = { version = "0.4", optional = true }

[dev-dependencies]
mktemp = "0.4"

[features]
cli = ["clap", "pretty_env_logger"]
```
**`src/db/bin/convert.rs`** (new file, 76 lines)

```rust
use std::path::PathBuf;

use garage_db::*;

use clap::Parser;

/// DB conversion command line interface
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
	/// Input DB path
	#[clap(short = 'i')]
	input_path: PathBuf,
	/// Input DB engine
	#[clap(short = 'a')]
	input_engine: String,

	/// Output DB path
	#[clap(short = 'o')]
	output_path: PathBuf,
	/// Output DB engine
	#[clap(short = 'b')]
	output_engine: String,
}

fn main() {
	let args = Args::parse();
	pretty_env_logger::init();

	match do_conversion(args) {
		Ok(()) => println!("Success!"),
		Err(e) => eprintln!("Error: {}", e),
	}
}

fn do_conversion(args: Args) -> Result<()> {
	let input = open_db(args.input_path, args.input_engine)?;
	let output = open_db(args.output_path, args.output_engine)?;
	output.import(&input)?;
	Ok(())
}

fn open_db(path: PathBuf, engine: String) -> Result<Db> {
	match engine.as_str() {
		"sled" => {
			let db = sled_adapter::sled::Config::default().path(&path).open()?;
			Ok(sled_adapter::SledDb::init(db))
		}
		"sqlite" | "sqlite3" | "rusqlite" => {
			let db = sqlite_adapter::rusqlite::Connection::open(&path)?;
			Ok(sqlite_adapter::SqliteDb::init(db))
		}
		"lmdb" | "heed" => {
			std::fs::create_dir_all(&path).map_err(|e| {
				Error(format!("Unable to create LMDB data directory: {}", e).into())
			})?;

			let map_size = if u32::MAX as usize == usize::MAX {
				eprintln!(
					"LMDB is not recommended on 32-bit systems, database size will be limited"
				);
				1usize << 30 // 1GB for 32-bit systems
			} else {
				1usize << 40 // 1TB for 64-bit systems
			};

			let db = lmdb_adapter::heed::EnvOpenOptions::new()
				.max_dbs(100)
				.map_size(map_size)
				.open(&path)
				.unwrap();
			Ok(lmdb_adapter::LmdbDb::init(db))
		}
		e => Err(Error(format!("Invalid DB engine: {}", e).into())),
	}
}
```
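The CLI above is a thin wrapper around `Db::import`. As a rough sketch of driving the same conversion from Rust (the paths are placeholders; only calls already visible in `convert.rs` and `lib.rs` are used), copying a Sled database into a fresh SQLite file could look like this:

```rust
use garage_db::*;

// Sketch: copy all trees from a Sled metadata DB into a new SQLite file.
// Paths are placeholders; the error conversions are the same ones convert.rs relies on.
fn sled_to_sqlite() -> Result<()> {
	// Source: existing Sled database.
	let sled = sled_adapter::sled::Config::default()
		.path("path/to/garage/meta/db")
		.open()?;
	let input = sled_adapter::SledDb::init(sled);

	// Destination: a new (empty) SQLite database.
	let sqlite = sqlite_adapter::rusqlite::Connection::open("path/to/garage/meta/db.sqlite")?;
	let output = sqlite_adapter::SqliteDb::init(sqlite);

	// Db::import refuses to overwrite existing data and copies each tree
	// inside a transaction, printing progress every 1000 items.
	output.import(&input)
}
```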
**`src/db/counted_tree_hack.rs`** (new file, 127 lines)

```rust
//! This hack allows a db tree to keep in RAM a counter of the number of entries
//! it contains, which is used to call .len() on it. This is useful only for
//! the sled backend where .len() otherwise would have to traverse the whole
//! tree to count items. For sqlite and lmdb, this is mostly useless (but
//! hopefully not harmful!). Note that a CountedTree cannot be part of a
//! transaction.

use std::sync::{
	atomic::{AtomicUsize, Ordering},
	Arc,
};

use crate::{Result, Tree, TxError, Value, ValueIter};

#[derive(Clone)]
pub struct CountedTree(Arc<CountedTreeInternal>);

struct CountedTreeInternal {
	tree: Tree,
	len: AtomicUsize,
}

impl CountedTree {
	pub fn new(tree: Tree) -> Result<Self> {
		let len = tree.len()?;
		Ok(Self(Arc::new(CountedTreeInternal {
			tree,
			len: AtomicUsize::new(len),
		})))
	}

	pub fn len(&self) -> usize {
		self.0.len.load(Ordering::SeqCst)
	}

	pub fn is_empty(&self) -> bool {
		self.len() == 0
	}

	pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
		self.0.tree.get(key)
	}

	pub fn first(&self) -> Result<Option<(Value, Value)>> {
		self.0.tree.first()
	}

	pub fn iter(&self) -> Result<ValueIter<'_>> {
		self.0.tree.iter()
	}

	// ---- writing functions ----

	pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<Value>>
	where
		K: AsRef<[u8]>,
		V: AsRef<[u8]>,
	{
		let old_val = self.0.tree.insert(key, value)?;
		if old_val.is_none() {
			self.0.len.fetch_add(1, Ordering::SeqCst);
		}
		Ok(old_val)
	}

	pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
		let old_val = self.0.tree.remove(key)?;
		if old_val.is_some() {
			self.0.len.fetch_sub(1, Ordering::SeqCst);
		}
		Ok(old_val)
	}

	pub fn compare_and_swap<K, OV, NV>(
		&self,
		key: K,
		expected_old: Option<OV>,
		new: Option<NV>,
	) -> Result<bool>
	where
		K: AsRef<[u8]>,
		OV: AsRef<[u8]>,
		NV: AsRef<[u8]>,
	{
		let old_some = expected_old.is_some();
		let new_some = new.is_some();

		let tx_res = self.0.tree.db().transaction(|mut tx| {
			let old_val = tx.get(&self.0.tree, &key)?;
			let is_same = match (&old_val, &expected_old) {
				(None, None) => true,
				(Some(x), Some(y)) if x == y.as_ref() => true,
				_ => false,
			};
			if is_same {
				match &new {
					Some(v) => {
						tx.insert(&self.0.tree, &key, v)?;
					}
					None => {
						tx.remove(&self.0.tree, &key)?;
					}
				}
				tx.commit(())
			} else {
				tx.abort(())
			}
		});

		match tx_res {
			Ok(()) => {
				match (old_some, new_some) {
					(false, true) => {
						self.0.len.fetch_add(1, Ordering::SeqCst);
					}
					(true, false) => {
						self.0.len.fetch_sub(1, Ordering::SeqCst);
					}
					_ => (),
				}
				Ok(true)
			}
			Err(TxError::Abort(())) => Ok(false),
			Err(TxError::Db(e)) => Err(e),
		}
	}
}
```
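A quick, hedged usage sketch of the hack (the tree name and on-disk path are invented for the example): the counter is seeded once from `Tree::len()`, after which `CountedTree::len()` is a plain atomic load even on the sled backend.

```rust
use garage_db::counted_tree_hack::CountedTree;
use garage_db::*;

fn counted_tree_demo() -> Result<()> {
	// Sled is the backend that actually benefits from the in-RAM counter.
	let sled = sled_adapter::sled::Config::default()
		.path("/tmp/counted-tree-demo") // placeholder path
		.open()?;
	let db = sled_adapter::SledDb::init(sled);

	let tree = db.open_tree("example_counters")?; // hypothetical tree name
	let counted = CountedTree::new(tree)?; // seeds the counter with tree.len()

	counted.insert(b"key1", b"v1")?; // new key: counter += 1
	counted.insert(b"key1", b"v2")?; // overwrite: counter unchanged
	assert_eq!(counted.len(), 1); // O(1) atomic load, no tree traversal

	counted.remove(b"key1")?; // counter -= 1
	assert!(counted.is_empty());
	Ok(())
}
```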
**`src/db/lib.rs`** (new file, 400 lines; the diff shown here is cut off partway through)

```rust
pub mod lmdb_adapter;
pub mod sled_adapter;
pub mod sqlite_adapter;

pub mod counted_tree_hack;

#[cfg(test)]
pub mod test;

use core::ops::{Bound, RangeBounds};

use std::borrow::Cow;
use std::cell::Cell;
use std::sync::Arc;

use err_derive::Error;

#[derive(Clone)]
pub struct Db(pub(crate) Arc<dyn IDb>);

pub struct Transaction<'a>(&'a mut dyn ITx);

#[derive(Clone)]
pub struct Tree(Arc<dyn IDb>, usize);

pub type Value = Vec<u8>;
pub type ValueIter<'a> = Box<dyn std::iter::Iterator<Item = Result<(Value, Value)>> + 'a>;
pub type TxValueIter<'a> = Box<dyn std::iter::Iterator<Item = TxOpResult<(Value, Value)>> + 'a>;

// ----

#[derive(Debug, Error)]
#[error(display = "{}", _0)]
pub struct Error(pub Cow<'static, str>);

pub type Result<T> = std::result::Result<T, Error>;

#[derive(Debug, Error)]
#[error(display = "{}", _0)]
pub struct TxOpError(pub(crate) Error);
pub type TxOpResult<T> = std::result::Result<T, TxOpError>;

pub enum TxError<E> {
	Abort(E),
	Db(Error),
}
pub type TxResult<R, E> = std::result::Result<R, TxError<E>>;

impl<E> From<TxOpError> for TxError<E> {
	fn from(e: TxOpError) -> TxError<E> {
		TxError::Db(e.0)
	}
}

pub fn unabort<R, E>(res: TxResult<R, E>) -> TxOpResult<std::result::Result<R, E>> {
	match res {
		Ok(v) => Ok(Ok(v)),
		Err(TxError::Abort(e)) => Ok(Err(e)),
		Err(TxError::Db(e)) => Err(TxOpError(e)),
	}
}

// ----

impl Db {
	pub fn engine(&self) -> String {
		self.0.engine()
	}

	pub fn open_tree<S: AsRef<str>>(&self, name: S) -> Result<Tree> {
		let tree_id = self.0.open_tree(name.as_ref())?;
		Ok(Tree(self.0.clone(), tree_id))
	}

	pub fn list_trees(&self) -> Result<Vec<String>> {
		self.0.list_trees()
	}

	pub fn transaction<R, E, F>(&self, fun: F) -> TxResult<R, E>
	where
		F: Fn(Transaction<'_>) -> TxResult<R, E>,
	{
		let f = TxFn {
			function: fun,
			result: Cell::new(None),
		};
		let tx_res = self.0.transaction(&f);
		let ret = f
			.result
			.into_inner()
			.expect("Transaction did not store result");

		match tx_res {
			Ok(()) => {
				assert!(matches!(ret, Ok(_)));
				ret
			}
			Err(TxError::Abort(())) => {
				assert!(matches!(ret, Err(TxError::Abort(_))));
				ret
			}
			Err(TxError::Db(e2)) => match ret {
				// Ok was stored -> the error occurred when finalizing
				// transaction
				Ok(_) => Err(TxError::Db(e2)),
				// An error was already stored: that's the one we want to
				// return
				Err(TxError::Db(e)) => Err(TxError::Db(e)),
				_ => unreachable!(),
			},
		}
	}

	pub fn import(&self, other: &Db) -> Result<()> {
		let existing_trees = self.list_trees()?;
		if !existing_trees.is_empty() {
			return Err(Error(
				format!(
					"destination database already contains data: {:?}",
					existing_trees
				)
				.into(),
			));
		}

		let tree_names = other.list_trees()?;
		for name in tree_names {
			let tree = self.open_tree(&name)?;
			if tree.len()? > 0 {
				return Err(Error(format!("tree {} already contains data", name).into()));
			}

			let ex_tree = other.open_tree(&name)?;

			let tx_res = self.transaction(|mut tx| {
				let mut i = 0;
				for item in ex_tree.iter().map_err(TxError::Abort)? {
					let (k, v) = item.map_err(TxError::Abort)?;
					tx.insert(&tree, k, v)?;
					i += 1;
					if i % 1000 == 0 {
						println!("{}: imported {}", name, i);
					}
				}
				tx.commit(i)
			});
			let total = match tx_res {
				Err(TxError::Db(e)) => return Err(e),
				Err(TxError::Abort(e)) => return Err(e),
				Ok(x) => x,
			};

			println!("{}: finished importing, {} items", name, total);
		}
		Ok(())
	}
}

#[allow(clippy::len_without_is_empty)]
impl Tree {
	#[inline]
	pub fn db(&self) -> Db {
		Db(self.0.clone())
	}

	#[inline]
	pub fn get<T: AsRef<[u8]>>(&self, key: T) -> Result<Option<Value>> {
		self.0.get(self.1, key.as_ref())
	}
	#[inline]
	pub fn len(&self) -> Result<usize> {
		self.0.len(self.1)
	}

	#[inline]
	pub fn first(&self) -> Result<Option<(Value, Value)>> {
		self.iter()?.next().transpose()
	}
	#[inline]
	pub fn get_gt<T: AsRef<[u8]>>(&self, from: T) -> Result<Option<(Value, Value)>> {
		self.range((Bound::Excluded(from), Bound::Unbounded))?
			.next()
			.transpose()
	}

	/// Returns the old value if there was one
	#[inline]
	pub fn insert<T: AsRef<[u8]>, U: AsRef<[u8]>>(
		&self,
		key: T,
		value: U,
	) -> Result<Option<Value>> {
		self.0.insert(self.1, key.as_ref(), value.as_ref())
	}
	/// Returns the old value if there was one
	#[inline]
	pub fn remove<T: AsRef<[u8]>>(&self, key: T) -> Result<Option<Value>> {
		self.0.remove(self.1, key.as_ref())
	}

	#[inline]
	pub fn iter(&self) -> Result<ValueIter<'_>> {
		self.0.iter(self.1)
	}
	#[inline]
	pub fn iter_rev(&self) -> Result<ValueIter<'_>> {
	// … (the remainder of the 400-line file is not shown in this diff view)
```