garage/src/table/table.rs

459 lines
12 KiB
Rust
Raw Normal View History

use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::Duration;
2020-04-08 20:00:41 +00:00
use log::warn;
use arc_swap::ArcSwapOption;
use futures::stream::*;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
2020-04-24 10:10:01 +00:00
use garage_util::data::*;
use garage_util::error::Error;
2020-04-23 17:05:46 +00:00
2021-02-21 12:11:10 +00:00
use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
2020-04-24 10:10:01 +00:00
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
2020-04-23 17:05:46 +00:00
2020-07-08 14:10:53 +00:00
use crate::schema::*;
2020-07-08 15:34:37 +00:00
use crate::table_sync::*;
2020-04-08 20:00:41 +00:00
/// Timeout (10 seconds) applied to the table RPC calls issued from this file.
const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
pub struct Table<F: TableSchema, R: TableReplication> {
2020-04-08 21:01:49 +00:00
pub instance: F,
pub replication: R,
2020-04-08 20:00:41 +00:00
pub name: String,
pub(crate) rpc_client: Arc<RpcClient<TableRPC<F>>>,
2020-04-08 20:00:41 +00:00
pub system: Arc<System>,
pub store: sled::Tree,
pub syncer: ArcSwapOption<TableSyncer<F, R>>,
2020-04-08 20:00:41 +00:00
}
2020-04-08 21:01:49 +00:00
#[derive(Serialize, Deserialize)]
pub(crate) enum TableRPC<F: TableSchema> {
2020-04-08 21:01:49 +00:00
Ok,
2020-04-08 21:47:34 +00:00
2020-04-09 14:16:27 +00:00
ReadEntry(F::P, F::S),
ReadEntryResponse(Option<ByteBuf>),
2020-04-08 21:47:34 +00:00
// Read range: read all keys in partition P, possibly starting at a certain sort key offset
ReadRange(F::P, Option<F::S>, Option<F::Filter>, usize),
Update(Vec<Arc<ByteBuf>>),
2020-04-16 16:41:10 +00:00
SyncRPC(SyncRPC),
2020-04-08 20:00:41 +00:00
}
2020-04-18 17:21:34 +00:00
impl<F: TableSchema> RpcMessage for TableRPC<F> {}
/// Replication strategy of a table: decides which nodes are contacted for reads
/// and writes, with which quorums, and which nodes hold the data.
pub trait TableReplication: Send + Sync {
// See examples in table_sharded.rs and table_fullcopy.rs
// to understand the various replication methods.

/// Nodes to which read requests for the given partition hash are sent.
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
/// Quorum used for read requests (`get` and `get_range`).
fn read_quorum(&self) -> usize;

/// Nodes to which writes for the given partition hash are sent.
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
/// Quorum used when inserting a single entry (`insert`).
fn write_quorum(&self, system: &System) -> usize;
/// Maximum number of failed per-node calls tolerated by `insert_many`
/// before it reports an error.
fn max_write_errors(&self) -> usize;

/// Nodes that actually replicate the data for the given hash.
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
/// NOTE(review): presumably the hashes at which the ring is split into
/// ranges for this table — confirm against the implementations.
fn split_points(&self, ring: &Ring) -> Vec<Hash>;
}
impl<F, R> Table<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
// =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) ===============
2020-04-16 12:50:49 +00:00
pub async fn new(
instance: F,
replication: R,
system: Arc<System>,
db: &sled::Db,
name: String,
2020-04-18 17:21:34 +00:00
rpc_server: &mut RpcServer,
2020-04-12 20:24:53 +00:00
) -> Arc<Self> {
let store = db.open_tree(&name).expect("Unable to open DB tree");
2020-04-18 17:21:34 +00:00
let rpc_path = format!("table_{}", name);
let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path);
2020-04-16 12:50:49 +00:00
let table = Arc::new(Self {
2020-04-08 21:01:49 +00:00
instance,
replication,
2020-04-08 20:00:41 +00:00
name,
2020-04-18 17:21:34 +00:00
rpc_client,
2020-04-08 20:00:41 +00:00
system,
store,
2020-04-17 16:27:29 +00:00
syncer: ArcSwapOption::from(None),
2020-04-16 12:50:49 +00:00
});
2020-04-18 17:21:34 +00:00
table.clone().register_handler(rpc_server, rpc_path);
2020-04-16 16:41:10 +00:00
let syncer = TableSyncer::launch(table.clone()).await;
2020-04-17 16:27:29 +00:00
table.syncer.swap(Some(syncer));
2020-04-18 17:21:34 +00:00
2020-04-16 12:50:49 +00:00
table
2020-04-08 20:00:41 +00:00
}
2020-04-09 14:16:27 +00:00
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system);
2020-04-17 19:59:07 +00:00
//eprintln!("insert who: {:?}", who);
2020-04-08 21:01:49 +00:00
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
2020-04-18 17:21:34 +00:00
let rpc = TableRPC::<F>::Update(vec![e_enc]);
2020-04-18 17:21:34 +00:00
self.rpc_client
.try_call_many(
&who[..],
rpc,
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system))
.with_timeout(TABLE_RPC_TIMEOUT),
)
.await?;
2020-04-08 21:01:49 +00:00
Ok(())
}
2020-04-08 20:00:41 +00:00
pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> {
let mut call_list = HashMap::new();
for entry in entries.iter() {
let hash = entry.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
for node in who {
if !call_list.contains_key(&node) {
call_list.insert(node, vec![]);
}
call_list.get_mut(&node).unwrap().push(e_enc.clone());
}
}
let call_futures = call_list.drain().map(|(node, entries)| async move {
let rpc = TableRPC::<F>::Update(entries);
let resp = self.rpc_client.call(node, rpc, TABLE_RPC_TIMEOUT).await?;
Ok::<_, Error>((node, resp))
});
let mut resps = call_futures.collect::<FuturesUnordered<_>>();
let mut errors = vec![];
while let Some(resp) = resps.next().await {
if let Err(e) = resp {
errors.push(e);
}
}
if errors.len() > self.replication.max_write_errors() {
Err(Error::Message("Too many errors".into()))
} else {
Ok(())
}
}
2020-04-16 12:50:49 +00:00
pub async fn get(
self: &Arc<Self>,
partition_key: &F::P,
sort_key: &F::S,
) -> Result<Option<F::E>, Error> {
2020-04-09 14:16:27 +00:00
let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system);
2020-04-17 19:59:07 +00:00
//eprintln!("get who: {:?}", who);
2020-04-08 20:00:41 +00:00
2020-04-18 17:21:34 +00:00
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
let resps = self
2020-04-18 17:21:34 +00:00
.rpc_client
.try_call_many(
&who[..],
rpc,
RequestStrategy::with_quorum(self.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
2020-04-08 21:01:49 +00:00
.await?;
2020-04-08 21:47:34 +00:00
let mut ret = None;
2020-04-09 18:58:39 +00:00
let mut not_all_same = false;
2020-04-08 21:01:49 +00:00
for resp in resps {
2020-04-08 21:47:34 +00:00
if let TableRPC::ReadEntryResponse(value) = resp {
if let Some(v_bytes) = value {
let v = self.decode_entry(v_bytes.as_slice())?;
2020-04-08 21:47:34 +00:00
ret = match ret {
None => Some(v),
Some(mut x) => {
2020-04-09 21:45:07 +00:00
if x != v {
2020-04-09 18:58:39 +00:00
not_all_same = true;
2020-04-09 21:45:07 +00:00
x.merge(&v);
2020-04-09 18:58:39 +00:00
}
2020-04-08 21:47:34 +00:00
Some(x)
}
}
2020-04-08 21:01:49 +00:00
}
2020-04-08 21:47:34 +00:00
} else {
return Err(Error::Message(format!("Invalid return value to read")));
2020-04-08 21:01:49 +00:00
}
}
2020-04-09 18:58:39 +00:00
if let Some(ret_entry) = &ret {
if not_all_same {
let self2 = self.clone();
let ent2 = ret_entry.clone();
2020-04-16 12:50:49 +00:00
self.system
.background
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
2020-04-09 18:58:39 +00:00
}
}
2020-04-08 21:47:34 +00:00
Ok(ret)
2020-04-08 21:01:49 +00:00
}
pub async fn get_range(
self: &Arc<Self>,
partition_key: &F::P,
begin_sort_key: Option<F::S>,
filter: Option<F::Filter>,
limit: usize,
) -> Result<Vec<F::E>, Error> {
let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system);
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
let resps = self
2020-04-18 17:21:34 +00:00
.rpc_client
.try_call_many(
&who[..],
rpc,
RequestStrategy::with_quorum(self.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
.await?;
let mut ret = BTreeMap::new();
let mut to_repair = BTreeMap::new();
for resp in resps {
if let TableRPC::Update(entries) = resp {
for entry_bytes in entries.iter() {
let entry = self.decode_entry(entry_bytes.as_slice())?;
let entry_key = self.tree_key(entry.partition_key(), entry.sort_key());
match ret.remove(&entry_key) {
None => {
ret.insert(entry_key, Some(entry));
}
Some(Some(mut prev)) => {
let must_repair = prev != entry;
prev.merge(&entry);
if must_repair {
to_repair.insert(entry_key.clone(), Some(prev.clone()));
}
ret.insert(entry_key, Some(prev));
}
Some(None) => unreachable!(),
}
}
}
}
if !to_repair.is_empty() {
let self2 = self.clone();
self.system.background.spawn_cancellable(async move {
for (_, v) in to_repair.iter_mut() {
self2.repair_on_read(&who[..], v.take().unwrap()).await?;
}
Ok(())
});
}
let ret_vec = ret
.iter_mut()
.take(limit)
.map(|(_k, v)| v.take().unwrap())
.collect::<Vec<_>>();
Ok(ret_vec)
}
// =============== UTILITY FUNCTION FOR CLIENT OPERATIONS ===============
async fn repair_on_read(&self, who: &[UUID], what: F::E) -> Result<(), Error> {
2020-04-16 12:50:49 +00:00
let what_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(&what)?));
2020-04-18 17:21:34 +00:00
self.rpc_client
.try_call_many(
&who[..],
TableRPC::<F>::Update(vec![what_enc]),
RequestStrategy::with_quorum(who.len()).with_timeout(TABLE_RPC_TIMEOUT),
2020-04-18 17:21:34 +00:00
)
2020-04-16 12:50:49 +00:00
.await?;
Ok(())
}
// =============== HANDLERS FOR RPC OPERATIONS (SERVER SIDE) ==============
2020-04-18 17:21:34 +00:00
fn register_handler(self: Arc<Self>, rpc_server: &mut RpcServer, path: String) {
let self2 = self.clone();
2020-04-18 17:21:34 +00:00
rpc_server.add_handler::<TableRPC<F>, _, _>(path, move |msg, _addr| {
let self2 = self2.clone();
async move { self2.handle(&msg).await }
});
let self2 = self.clone();
self.rpc_client
.set_local_handler(self.system.id, move |msg| {
let self2 = self2.clone();
async move { self2.handle(&msg).await }
});
}
async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> {
2020-04-08 21:01:49 +00:00
match msg {
2020-04-08 21:47:34 +00:00
TableRPC::ReadEntry(key, sort_key) => {
let value = self.handle_read_entry(key, sort_key)?;
2020-04-08 21:47:34 +00:00
Ok(TableRPC::ReadEntryResponse(value))
2020-04-08 21:01:49 +00:00
}
TableRPC::ReadRange(key, begin_sort_key, filter, limit) => {
let values = self.handle_read_range(key, begin_sort_key, filter, *limit)?;
Ok(TableRPC::Update(values))
}
2020-04-08 21:01:49 +00:00
TableRPC::Update(pairs) => {
2020-04-08 21:47:34 +00:00
self.handle_update(pairs).await?;
2020-04-08 21:01:49 +00:00
Ok(TableRPC::Ok)
}
TableRPC::SyncRPC(rpc) => {
2020-04-17 16:27:29 +00:00
let syncer = self.syncer.load_full().unwrap();
let response = syncer
.handle_rpc(rpc, self.system.background.stop_signal.clone())
.await?;
Ok(TableRPC::SyncRPC(response))
2020-04-16 16:41:10 +00:00
}
2020-11-08 14:04:30 +00:00
_ => Err(Error::BadRPC(format!("Unexpected table RPC"))),
2020-04-08 21:01:49 +00:00
}
}
fn handle_read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
2020-04-09 14:16:27 +00:00
let tree_key = self.tree_key(p, s);
2020-04-08 21:47:34 +00:00
if let Some(bytes) = self.store.get(&tree_key)? {
Ok(Some(ByteBuf::from(bytes.to_vec())))
2020-04-08 21:47:34 +00:00
} else {
Ok(None)
2020-04-08 21:01:49 +00:00
}
}
/// Reads up to `limit` serialized entries of partition `p` from the local
/// store, in key order.
///
/// * `s` - optional sort key at which the scan starts; when `None` the scan
///   starts at the beginning of the partition.
/// * `filter` - when present, only entries for which `F::matches_filter`
///   returns true are kept (this requires decoding each scanned entry).
/// * `limit` - maximum number of entries returned; `0` returns nothing.
///
/// # Errors
///
/// Returns an error if the store iteration fails or, when a filter is
/// given, if a scanned entry cannot be decoded.
fn handle_read_range(
    &self,
    p: &F::P,
    s: &Option<F::S>,
    filter: &Option<F::Filter>,
    limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
    // The limit is checked after each push below, so a zero limit has to be
    // handled up front or one entry would still be returned.
    if limit == 0 {
        return Ok(vec![]);
    }

    let partition_hash = p.hash();
    let first_key = match s {
        None => partition_hash.to_vec(),
        Some(sk) => self.tree_key(p, sk),
    };

    let mut ret = vec![];
    for item in self.store.range(first_key..) {
        let (key, value) = item?;
        // Tree keys start with the partition hash: the first key without
        // that prefix marks the end of the partition. `starts_with` also
        // copes with keys shorter than the hash instead of panicking.
        if !key.starts_with(partition_hash.as_slice()) {
            break;
        }
        let keep = match filter {
            None => true,
            Some(f) => {
                let entry = self.decode_entry(value.as_ref())?;
                F::matches_filter(&entry, f)
            }
        };
        if keep {
            ret.push(Arc::new(ByteBuf::from(value.as_ref())));
        }
        if ret.len() >= limit {
            break;
        }
    }
    Ok(ret)
}
pub async fn handle_update(self: &Arc<Self>, entries: &[Arc<ByteBuf>]) -> Result<(), Error> {
let syncer = self.syncer.load_full().unwrap();
for update_bytes in entries.iter() {
let update = self.decode_entry(update_bytes.as_slice())?;
2020-04-09 18:58:39 +00:00
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
let (old_entry, new_entry) = self.store.transaction(|db| {
2020-04-09 21:45:07 +00:00
let (old_entry, new_entry) = match db.get(&tree_key)? {
2020-04-09 18:58:39 +00:00
Some(prev_bytes) => {
2020-12-05 18:23:46 +00:00
let old_entry = self
.decode_entry(&prev_bytes)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
2020-04-09 21:45:07 +00:00
let mut new_entry = old_entry.clone();
new_entry.merge(&update);
(Some(old_entry), new_entry)
2020-04-09 18:58:39 +00:00
}
None => (None, update.clone()),
2020-04-09 18:58:39 +00:00
};
2020-04-08 21:01:49 +00:00
2020-04-09 18:58:39 +00:00
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RMPEncode)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
2020-04-09 18:58:39 +00:00
db.insert(tree_key.clone(), new_bytes)?;
Ok((old_entry, new_entry))
2020-04-09 18:58:39 +00:00
})?;
2020-04-08 21:01:49 +00:00
if old_entry.as_ref() != Some(&new_entry) {
self.instance.updated(old_entry, Some(new_entry));
syncer.invalidate(&tree_key[..]);
2020-04-16 17:28:02 +00:00
}
2020-04-08 21:01:49 +00:00
}
2020-04-08 21:01:49 +00:00
Ok(())
2020-04-08 20:00:41 +00:00
}
2020-04-09 14:16:27 +00:00
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
let removed = self.store.transaction(|txn| {
2021-02-23 21:45:36 +00:00
if let Some(cur_v) = txn.get(k)? {
if cur_v == v {
2021-02-24 10:05:59 +00:00
txn.remove(k)?;
return Ok(true);
}
2020-04-17 12:49:10 +00:00
}
Ok(false)
})?;
if removed {
let old_entry = self.decode_entry(v)?;
self.instance.updated(Some(old_entry), None);
self.syncer.load_full().unwrap().invalidate(k);
2020-04-17 12:49:10 +00:00
}
Ok(removed)
2020-04-16 16:41:10 +00:00
}
2020-04-09 14:16:27 +00:00
/// Builds the key under which an entry is stored in the local tree:
/// the hash of the partition key followed by the bytes of the sort key.
fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
    let partition_hash = p.hash();
    let mut key = partition_hash.to_vec();
    key.extend(s.sort_key());
    key
}
2020-07-08 14:10:53 +00:00
fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
2020-07-08 15:34:37 +00:00
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
Ok(x) => Ok(x),
Err(e) => match F::try_migrate(bytes) {
Some(x) => Ok(x),
None => {
warn!("Unable to decode entry of {}: {}", self.name, e);
2020-11-20 22:37:34 +00:00
for line in hexdump::hexdump_iter(bytes) {
debug!("{}", line);
}
Err(e.into())
}
2020-07-08 15:34:37 +00:00
},
}
}
2020-04-08 20:00:41 +00:00
}