Alex b44d3fc796 Abstract database behind generic interface and implement alternative drivers (#322)
- [x] Design interface
- [x] Implement Sled backend
  - [x] Re-implement the SledCountedTree hack ~~on Sled backend~~ on all backends (i.e. over the abstraction)
- [x] Convert Garage code to use generic interface
- [x] Proof-read converted Garage code
- [ ] Test everything well
- [x] Implement sqlite backend
- [x] Implement LMDB backend
- [ ] (Implement Persy backend?)
- [ ] (Implement other backends? (like RocksDB, ...))
- [x] Implement backend choice in config file and garage server module
- [x] Add CLI for converting between DB formats
- Exploit the new interface to put more things in transactions
  - [x] `.updated()` trigger on Garage tables

Fix #284


- [x] When exporting sqlite, trees iterate empty??
- [x] LMDB doesn't work

**Known issues for various back-ends**

- Sled:
  - Eats all my RAM and also all my disk space
  - `.len()` has to traverse the whole table
  - Is actually quite slow on some operations
  - And is actually pretty bad code...
- Sqlite:
  - Requires a lock to be taken on all operations. The lock is also taken when iterating on a table with `.iter()`, and the lock isn't released until the iterator is dropped. This means that we must be VERY careful to not do anything else inside a `.iter()` loop or else we will have a deadlock! Most such cases have been eliminated from the Garage codebase, but there might still be some that remain. If your Garage-over-Sqlite seems to hang/freeze, this is the reason.
  - (adapter uses a bunch of unsafe code)
- Heed (LMDB):
  - Not suited for 32-bit machines as it has to map the whole DB in memory.
  - (adapter uses a tiny bit of unsafe code)

**My recommendation:** avoid 32-bit machines and use LMDB as much as possible.

**Converting databases** is actually quite easy. For example from Sled to LMDB:

cd src/db
cargo run --features cli --bin convert -- -i path/to/garage/meta/db -a sled -o path/to/garage/meta/db.lmdb -b lmdb

Then, just add this to your `config.toml`:

db_engine = "lmdb"

Co-authored-by: Alex Auvolat <>
Reviewed-on: Deuxfleurs/garage#322
Co-authored-by: Alex <>
Co-committed-by: Alex <>
2022-06-08 10:01:44 +02:00

304 lines
7.5 KiB

use std::collections::{hash_map, BTreeMap, HashMap};
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use tokio::sync::{mpsc, watch};
use garage_db as db;
use garage_rpc::ring::Ring;
use garage_rpc::system::System;
use garage_util::data::*;
use garage_util::error::*;
use garage_table::crdt::*;
use garage_table::replication::TableShardedReplication;
use garage_table::*;
/// Describes one family of counters: its name and the key types of its entries.
///
/// `NAME` is used as the table name by `CounterTable` and in the name of the
/// local counter tree opened by `IndexCounter::new`.
pub trait CounterSchema: Clone + PartialEq + Send + Sync + 'static {
/// Name identifying this family of counters
const NAME: &'static str;
/// Partition key type of the counter entries
type P: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync;
/// Sort key type of the counter entries
type S: SortKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync;
/// A counter entry in the global table
///
/// An entry is addressed by a partition key and a sort key, and holds a set of
/// named counters.
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct CounterEntry<T: CounterSchema> {
/// Partition key of this entry
pub pk: T::P,
/// Sort key of this entry
pub sk: T::S,
/// The counters of this entry, indexed by counter name
pub values: BTreeMap<String, CounterValue>,
// Makes `CounterEntry` usable as a table entry keyed by `(T::P, T::S)`.
impl<T: CounterSchema> Entry<T::P, T::S> for CounterEntry<T> {
// Accessor for the partition key
fn partition_key(&self) -> &T::P {
// Accessor for the sort key
fn sort_key(&self) -> &T::S {
// Tombstone test. The predicate below holds for a counter when all of its
// per-node values are zero (the timestamp component is ignored).
// NOTE(review): the receiver of `.all` is not visible here; presumably it
// iterates over `self.values` — confirm.
fn is_tombstone(&self) -> bool {
.all(|(_, v)| v.node_values.iter().all(|(_, (_, v))| *v == 0))
impl<T: CounterSchema> CounterEntry<T> {
/// Computes counter values restricted to the nodes of the ring's layout
/// (`ring.layout.node_id_vec`).
///
/// NOTE(review): presumably delegates to `filtered_values_with_nodes` with
/// that node list — confirm.
pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> {
let nodes = &ring.layout.node_id_vec[..];
/// Computes, for each counter, a single value taking into account only the
/// values reported by the nodes listed in `nodes`.
///
/// Per-node values whose node is not in `nodes` are discarded; when at least
/// one value remains, the retained values are reduced to their maximum.
/// Counters with no retained value get no result.
pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> {
let mut ret = HashMap::new();
for (name, vals) in self.values.iter() {
let new_vals = vals
// Keep only values coming from one of the requested nodes
.filter(|(n, _)| nodes.contains(n))
// Drop the timestamp, keep the counter value
.map(|(_, (_, v))| *v)
if !new_vals.is_empty() {
// Maximum of the retained values; `i64::MIN` is only the fold seed since
// `new_vals` is non-empty here
new_vals.iter().fold(i64::MIN, |a, b| std::cmp::max(a, *b)),
/// The value of one named counter, stored as one value per node
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct CounterValue {
/// For each node, a `(timestamp, value)` pair: the `u64` is the component
/// compared when merging two values, the `i64` is the counter value itself
pub node_values: BTreeMap<Uuid, (u64, i64)>,
impl<T: CounterSchema> Crdt for CounterEntry<T> {
/// Merges the counters of `other` into `self`, counter name by counter name.
fn merge(&mut self, other: &Self) {
for (name, e2) in other.values.iter() {
// Counter known on both sides.
// NOTE(review): the body of this branch is not visible here; presumably
// the two `CounterValue`s are merged (`e.merge(e2)`) — confirm.
if let Some(e) = self.values.get_mut(name) {
} else {
// Counter only known to `other`: adopt a copy of it as-is
self.values.insert(name.clone(), e2.clone());
impl Crdt for CounterValue {
fn merge(&mut self, other: &Self) {
for (node, (t2, e2)) in other.node_values.iter() {
if let Some((t, e)) = self.node_values.get_mut(node) {
if t2 > t {
*e = *e2;
} else {
self.node_values.insert(*node, (*t2, *e2));
/// Table schema for the counters described by `T`.
///
/// Holds no data of its own; `T` is only carried as a type-level marker.
pub struct CounterTable<T: CounterSchema> {
// Marker field tying this table schema to its counter schema
_phantom_t: PhantomData<T>,
impl<T: CounterSchema> TableSchema for CounterTable<T> {
// The table takes the name of the counter schema
const TABLE_NAME: &'static str = T::NAME;
// Key types are those of the counter schema
type P = T::P;
type S = T::S;
// Entries of this table are counter entries
type E = CounterEntry<T>;
// A deleted-ness filter paired with a list of `Uuid`s
// (presumably the node ids whose values are taken into account — confirm)
type Filter = (DeletedFilter, Vec<Uuid>);
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
// `DeletedFilter::Any` accepts every entry without looking at its values
if filter.0 == DeletedFilter::Any {
return true;
// Otherwise, determine whether the entry counts as a tombstone, i.e. whether
// all of the values considered are zero.
// NOTE(review): the values being tested are not visible here; presumably
// `entry.filtered_values_with_nodes(&filter.1)` — confirm.
let is_tombstone = entry
.all(|(_, v)| *v == 0);
// ----
/// Keeps counters in a local database tree and propagates their updated
/// values to a counter table.
pub struct IndexCounter<T: CounterSchema> {
/// Identifier of this node; propagated values are stored under this key
this_node: Uuid,
/// Database tree holding the local counter entries
local_counter: db::Tree,
/// Sending side of the channel on which `count` queues updated entries
/// for `propagate_loop`
propagate_tx: mpsc::UnboundedSender<(T::P, T::S, LocalCounterEntry)>,
/// The table into which batches of counter entries are inserted
pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>,
impl<T: CounterSchema> IndexCounter<T> {
/// Creates the index counter: opens the tree `local_counter:<T::NAME>` in `db`,
/// builds the counter table and creates the channel that feeds `propagate_loop`.
///
/// # Panics
///
/// Panics with "Unable to open local counter tree" if the tree cannot be opened.
pub fn new(
system: Arc<System>,
replication: TableShardedReplication,
db: &db::Db,
) -> Arc<Self> {
let background = system.background.clone();
// Unbounded channel: `count` sends on it, `propagate_loop` receives from it
let (propagate_tx, propagate_rx) = mpsc::unbounded_channel();
let this = Arc::new(Self {
// One local tree per counter schema, named after `T::NAME`
local_counter: db
.open_tree(format!("local_counter:{}", T::NAME))
.expect("Unable to open local counter tree"),
table: Table::new(
CounterTable {
_phantom_t: Default::default(),
// A second handle on the counter is moved into the closure running `propagate_loop`.
// NOTE(review): the call that registers this closure is not visible here;
// presumably it is spawned on `background` under the name below — confirm.
let this2 = this.clone();
format!("{} index counter propagator", T::NAME),
move |must_exit| this2.clone().propagate_loop(propagate_rx, must_exit),
/// Applies increments to the local counters of entry `(pk, sk)`, inside the
/// database transaction `tx`.
///
/// `counts` is a list of `(counter name, increment)` pairs. For each of them,
/// the first component of the stored pair is increased by one and the increment
/// is added to the second component; counters that do not exist yet start at
/// `(0, 0)`. The updated entry is written back to the local tree and queued on
/// the propagation channel.
///
/// # Errors
///
/// Errors of `tx.get` and `tx.insert` are propagated with `?`. A failure to
/// send on the propagation channel is handled locally and only produces the
/// message below.
pub fn count(
tx: &mut db::Transaction,
pk: &T::P,
sk: &T::S,
counts: &[(&str, i64)],
) -> db::TxResult<(), Error> {
// Key of the entry in the local tree (derived from the partition and sort keys)
let tree_key =, sk);
// Load the current local entry, or start from an empty one
let mut entry = match tx.get(&self.local_counter, &tree_key[..])? {
Some(old_bytes) => rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>(&old_bytes)
None => LocalCounterEntry {
values: BTreeMap::new(),
for (s, inc) in counts.iter() {
let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0));
// First component: bumped on every update (used as timestamp when merging)
ent.0 += 1;
// Second component: the counter value itself
ent.1 += *inc;
let new_entry_bytes = rmp_to_vec_all_named(&entry)
tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?;
// NOTE(review): the entry is queued for propagation from inside the
// transaction body, i.e. before the caller's transaction has committed —
// confirm that propagating values of an aborted transaction is acceptable.
if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) {
"Could not propagate updated counter values, failed to send to channel: {}",
/// Background task that receives updated local entries from `propagate_rx`,
/// groups them in a buffer keyed by tree key, and inserts the buffered entries
/// into `self.table` with `insert_many`.
///
/// The loop stops once the channel is closed or `must_exit` holds `true`.
async fn propagate_loop(
self: Arc<Self>,
mut propagate_rx: mpsc::UnboundedReceiver<(T::P, T::S, LocalCounterEntry)>,
must_exit: watch::Receiver<bool>,
) {
// This loop batches updates to counters to be sent all at once.
// They are sent once the propagate_rx channel has been emptied (or is closed).
let mut buf = HashMap::new();
// Number of consecutive failed attempts at sending the current batch
let mut errors = 0;
loop {
// First try to take an entry without waiting
let (ent, closed) = match propagate_rx.try_recv() {
Ok(ent) => (Some(ent), false),
// Nothing queued and nothing buffered: wait until an entry arrives
// or the channel is closed
Err(mpsc::error::TryRecvError::Empty) if buf.is_empty() => {
match propagate_rx.recv().await {
Some(ent) => (Some(ent), false),
None => (None, true),
// Nothing queued but the buffer is not empty: go on and send the batch
Err(mpsc::error::TryRecvError::Empty) => (None, false),
Err(mpsc::error::TryRecvError::Disconnected) => (None, true),
if let Some((pk, sk, counters)) = ent {
let tree_key =, &sk);
// Convert the local entry into a table entry holding this node's values
let dist_entry = counters.into_counter_entry::<T>(self.this_node, pk, sk);
// At most one buffered entry per tree key.
// NOTE(review): the bodies of both arms are not visible here; presumably
// a vacant slot takes `dist_entry` and an occupied one is merged with
// it — confirm.
match buf.entry(tree_key) {
hash_map::Entry::Vacant(e) => {
hash_map::Entry::Occupied(mut e) => {
// As long as we can add entries, loop back and add them to batch
// before sending batch to other nodes
if !buf.is_empty() {
let entries = buf.iter().map(|(_k, v)| v);
if let Err(e) = self.table.insert_many(entries).await {
errors += 1;
// After at least two failures, and only if an exit was requested,
// report the batch as lost; otherwise warn and retry
if errors >= 2 && *must_exit.borrow() {
error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::NAME, buf.len(), e);
warn!("({}) Could not propagate {} counter values: {}, retrying in 5 seconds (retry #{})", T::NAME, buf.len(), e, errors);
// NOTE(review): presumably reached once the batch has been handled, so
// that failures are counted per batch — confirm.
errors = 0;
// Stop when the channel is closed or an exit was requested
if closed || *must_exit.borrow() {
/// The counters of one entry as stored in the local counter tree
/// (written with `rmp_to_vec_all_named`, read back with `rmp_serde`).
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
struct LocalCounterEntry {
/// For each counter name, a `(u64, i64)` pair: `IndexCounter::count` adds one
/// to the first component on every update and adds the increment to the second
values: BTreeMap<String, (u64, i64)>,
impl LocalCounterEntry {
/// Converts this local entry into a `CounterEntry<T>` in which every counter
/// has exactly one per-node value, stored under `this_node`.
///
/// NOTE(review): `pk` and `sk` are presumably stored as the keys of the
/// resulting entry; those lines are not visible here — confirm.
fn into_counter_entry<T: CounterSchema>(
this_node: Uuid,
pk: T::P,
sk: T::S,
) -> CounterEntry<T> {
CounterEntry {
values: self
// Each local `(timestamp, value)` pair becomes a `CounterValue` with a
// single entry, keyed by the local node
.map(|(name, (ts, v))| {
let mut node_values = BTreeMap::new();
node_values.insert(this_node, (ts, v));
(name, CounterValue { node_values })