garage/src/table.rs

306 lines
7.0 KiB
Rust
Raw Normal View History

2020-04-08 20:00:41 +00:00
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::time::Duration;
2020-04-08 20:00:41 +00:00
use crate::data::*;
use crate::error::Error;
2020-04-08 20:00:41 +00:00
use crate::membership::System;
use crate::proto::*;
2020-04-08 20:00:41 +00:00
use crate::rpc_client::*;
pub struct Table<F: TableFormat> {
2020-04-08 21:01:49 +00:00
pub instance: F,
2020-04-08 20:00:41 +00:00
pub name: String,
pub system: Arc<System>,
pub store: sled::Tree,
pub partitions: Vec<Partition>,
pub param: TableReplicationParams,
}
/// Replication settings applied to every RPC issued for a table.
#[derive(Clone, Debug)]
pub struct TableReplicationParams {
    /// Number of nodes requested from the ring walk for a given key.
    pub replication_factor: usize,
    /// Quorum passed to the RPC layer for read requests.
    pub read_quorum: usize,
    /// Quorum passed to the RPC layer for write (update) requests.
    pub write_quorum: usize,
    /// Timeout passed to the RPC layer for each table RPC.
    pub timeout: Duration,
}
/// Type-erased entry point for table RPCs, working on raw bytes so that
/// callers do not need to know the table's format type.
#[async_trait]
pub trait TableRpcHandler {
/// Handles one request given as raw bytes (`rpc`) and returns the reply as
/// raw bytes, or an error.
async fn handle(&self, rpc: &[u8]) -> Result<Vec<u8>, Error>;
}
/// Adapter exposing a shared `Table<F>` through the `TableRpcHandler` trait.
struct TableRpcHandlerAdapter<F: TableFormat> {
// Table to which incoming requests are forwarded.
table: Arc<Table<F>>,
}
#[async_trait]
impl<F: TableFormat + 'static> TableRpcHandler for TableRpcHandlerAdapter<F> {
async fn handle(&self, rpc: &[u8]) -> Result<Vec<u8>, Error> {
let msg = rmp_serde::decode::from_read_ref::<_, TableRPC<F>>(rpc)?;
let rep = self.table.handle(msg).await?;
2020-04-09 16:43:53 +00:00
Ok(rmp_to_vec_all_named(&rep)?)
2020-04-08 20:00:41 +00:00
}
}
2020-04-08 21:01:49 +00:00
#[derive(Serialize, Deserialize)]
2020-04-08 20:00:41 +00:00
pub enum TableRPC<F: TableFormat> {
2020-04-08 21:01:49 +00:00
Ok,
2020-04-08 21:47:34 +00:00
2020-04-09 14:16:27 +00:00
ReadEntry(F::P, F::S),
ReadEntryResponse(Option<F::E>),
2020-04-08 21:47:34 +00:00
2020-04-09 14:16:27 +00:00
Update(Vec<F::E>),
2020-04-08 20:00:41 +00:00
}
/// Element type of `Table::partitions`.
pub struct Partition {
// NOTE(review): presumably the bounds of the hash range covered by this
// partition — confirm; nothing in this file reads them.
pub begin: Hash,
pub end: Hash,
// NOTE(review): presumably the other nodes that hold this partition — confirm.
pub other_nodes: Vec<UUID>,
}
2020-04-09 15:32:28 +00:00
pub trait PartitionKey {
2020-04-08 20:00:41 +00:00
fn hash(&self) -> Hash;
}
2020-04-09 15:32:28 +00:00
/// A key exposing a byte representation; those bytes are appended to the
/// partition hash to form an entry's local storage key.
pub trait SortKey {
    /// Returns the bytes of this sort key.
    fn sort_key(&self) -> &[u8];
}
/// A table entry, identified by a partition key and a sort key, that can be
/// compared, cloned, serialized and merged with another version of itself.
pub trait Entry<P: PartitionKey, S: SortKey>:
    PartialEq + Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync
{
    /// Returns the partition key of this entry.
    fn partition_key(&self) -> &P;
    /// Returns the sort key of this entry.
    fn sort_key(&self) -> &S;

    /// Merges `other` into `self`; used to reconcile differing versions of
    /// the same entry.
    fn merge(&mut self, other: &Self);
}
2020-04-09 14:16:27 +00:00
/// Unit sort key whose `SortKey` implementation yields an empty byte slice.
#[derive(Clone, Serialize, Deserialize)]
pub struct EmptySortKey;
impl SortKey for EmptySortKey {
    /// Always returns an empty byte slice.
    fn sort_key(&self) -> &[u8] {
        b""
    }
}
2020-04-09 15:32:28 +00:00
impl<T: AsRef<str>> PartitionKey for T {
2020-04-09 14:16:27 +00:00
fn hash(&self) -> Hash {
2020-04-09 15:32:28 +00:00
hash(self.as_ref().as_bytes())
2020-04-09 14:16:27 +00:00
}
}
2020-04-09 15:32:28 +00:00
impl<T: AsRef<str>> SortKey for T {
2020-04-09 14:16:27 +00:00
fn sort_key(&self) -> &[u8] {
2020-04-09 15:32:28 +00:00
self.as_ref().as_bytes()
2020-04-09 14:16:27 +00:00
}
}
2020-04-09 21:45:07 +00:00
impl PartitionKey for Hash {
    /// A hash is its own partition hash: returns a copy of `self`.
    fn hash(&self) -> Hash {
        Clone::clone(self)
    }
}
2020-04-08 20:00:41 +00:00
#[async_trait]
pub trait TableFormat: Send + Sync {
2020-04-09 15:32:28 +00:00
type P: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync;
type S: SortKey + Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync;
2020-04-09 14:16:27 +00:00
type E: Entry<Self::P, Self::S>;
2020-04-08 20:00:41 +00:00
2020-04-09 14:16:27 +00:00
async fn updated(&self, old: Option<&Self::E>, new: &Self::E);
2020-04-08 20:00:41 +00:00
}
impl<F: TableFormat + 'static> Table<F> {
/// Creates a table called `name`, backed by the sled tree of the same name
/// in `db`. The partition list starts out empty.
///
/// # Panics
///
/// Panics if the sled tree cannot be opened.
pub fn new(
    instance: F,
    system: Arc<System>,
    db: &sled::Db,
    name: String,
    param: TableReplicationParams,
) -> Self {
    let store = db.open_tree(&name).expect("Unable to open DB tree");
    Self {
        instance,
        name,
        system,
        store,
        partitions: Vec::new(),
        param,
    }
}
pub fn rpc_handler(self: Arc<Self>) -> Box<dyn TableRpcHandler + Send + Sync> {
Box::new(TableRpcHandlerAdapter::<F> { table: self })
2020-04-08 20:00:41 +00:00
}
2020-04-09 14:16:27 +00:00
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash();
let who = self
.system
.members
.read()
.await
2020-04-08 21:01:49 +00:00
.walk_ring(&hash, self.param.replication_factor);
2020-04-09 16:43:53 +00:00
eprintln!("insert who: {:?}", who);
2020-04-08 21:01:49 +00:00
2020-04-09 14:16:27 +00:00
let rpc = &TableRPC::<F>::Update(vec![e.clone()]);
self.rpc_try_call_many(&who[..], &rpc, self.param.write_quorum)
.await?;
2020-04-08 21:01:49 +00:00
Ok(())
}
2020-04-08 20:00:41 +00:00
2020-04-09 14:16:27 +00:00
pub async fn get(&self, partition_key: &F::P, sort_key: &F::S) -> Result<Option<F::E>, Error> {
let hash = partition_key.hash();
let who = self
.system
.members
.read()
.await
2020-04-08 20:00:41 +00:00
.walk_ring(&hash, self.param.replication_factor);
2020-04-09 16:43:53 +00:00
eprintln!("get who: {:?}", who);
2020-04-08 20:00:41 +00:00
2020-04-09 14:16:27 +00:00
let rpc = &TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
let resps = self
.rpc_try_call_many(&who[..], &rpc, self.param.read_quorum)
2020-04-08 21:01:49 +00:00
.await?;
2020-04-08 21:47:34 +00:00
let mut ret = None;
2020-04-09 18:58:39 +00:00
let mut not_all_same = false;
2020-04-08 21:01:49 +00:00
for resp in resps {
2020-04-08 21:47:34 +00:00
if let TableRPC::ReadEntryResponse(value) = resp {
if let Some(v) = value {
ret = match ret {
None => Some(v),
Some(mut x) => {
2020-04-09 21:45:07 +00:00
if x != v {
2020-04-09 18:58:39 +00:00
not_all_same = true;
2020-04-09 21:45:07 +00:00
x.merge(&v);
2020-04-09 18:58:39 +00:00
}
2020-04-08 21:47:34 +00:00
Some(x)
}
}
2020-04-08 21:01:49 +00:00
}
2020-04-08 21:47:34 +00:00
} else {
return Err(Error::Message(format!("Invalid return value to read")));
2020-04-08 21:01:49 +00:00
}
}
2020-04-09 18:58:39 +00:00
if let Some(ret_entry) = &ret {
if not_all_same {
// Repair on read
let _: Result<_, _> = self
.rpc_try_call_many(
2020-04-09 18:58:39 +00:00
&who[..],
&TableRPC::<F>::Update(vec![ret_entry.clone()]),
who.len(),
)
2020-04-09 18:58:39 +00:00
.await;
}
}
2020-04-08 21:47:34 +00:00
Ok(ret)
2020-04-08 21:01:49 +00:00
}
async fn rpc_try_call_many(
&self,
who: &[UUID],
rpc: &TableRPC<F>,
quorum: usize,
) -> Result<Vec<TableRPC<F>>, Error> {
2020-04-09 16:43:53 +00:00
eprintln!("Table RPC to {:?}: {}", who, serde_json::to_string(&rpc)?);
let rpc_bytes = rmp_to_vec_all_named(rpc)?;
2020-04-08 21:01:49 +00:00
let rpc_msg = Message::TableRPC(self.name.to_string(), rpc_bytes);
let resps = rpc_try_call_many(
self.system.clone(),
who,
&rpc_msg,
quorum,
self.param.timeout,
)
.await?;
2020-04-08 21:01:49 +00:00
let mut resps_vals = vec![];
for resp in resps {
if let Message::TableRPC(tbl, rep_by) = &resp {
if *tbl == self.name {
resps_vals.push(rmp_serde::decode::from_read_ref(&rep_by)?);
continue;
}
}
return Err(Error::Message(format!(
"Invalid reply to TableRPC: {:?}",
resp
)));
2020-04-08 21:01:49 +00:00
}
eprintln!(
"Table RPC responses: {}",
serde_json::to_string(&resps_vals)?
);
2020-04-08 21:01:49 +00:00
Ok(resps_vals)
2020-04-08 20:00:41 +00:00
}
async fn handle(&self, msg: TableRPC<F>) -> Result<TableRPC<F>, Error> {
2020-04-08 21:01:49 +00:00
match msg {
2020-04-08 21:47:34 +00:00
TableRPC::ReadEntry(key, sort_key) => {
let value = self.handle_read_entry(&key, &sort_key)?;
Ok(TableRPC::ReadEntryResponse(value))
2020-04-08 21:01:49 +00:00
}
TableRPC::Update(pairs) => {
2020-04-08 21:47:34 +00:00
self.handle_update(pairs).await?;
2020-04-08 21:01:49 +00:00
Ok(TableRPC::Ok)
}
_ => Err(Error::RPCError(format!("Unexpected table RPC"))),
2020-04-08 21:01:49 +00:00
}
}
2020-04-09 14:16:27 +00:00
fn handle_read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<F::E>, Error> {
let tree_key = self.tree_key(p, s);
2020-04-08 21:47:34 +00:00
if let Some(bytes) = self.store.get(&tree_key)? {
2020-04-09 14:16:27 +00:00
let e = rmp_serde::decode::from_read_ref::<_, F::E>(&bytes)?;
Ok(Some(e))
2020-04-08 21:47:34 +00:00
} else {
Ok(None)
2020-04-08 21:01:49 +00:00
}
}
2020-04-09 14:16:27 +00:00
async fn handle_update(&self, mut entries: Vec<F::E>) -> Result<(), Error> {
2020-04-09 18:58:39 +00:00
for update in entries.drain(..) {
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
let (old_entry, new_entry) = self.store.transaction(|db| {
2020-04-09 21:45:07 +00:00
let (old_entry, new_entry) = match db.get(&tree_key)? {
2020-04-09 18:58:39 +00:00
Some(prev_bytes) => {
let old_entry = rmp_serde::decode::from_read_ref::<_, F::E>(&prev_bytes)
.map_err(Error::RMPDecode)
.map_err(sled::ConflictableTransactionError::Abort)?;
2020-04-09 21:45:07 +00:00
let mut new_entry = old_entry.clone();
new_entry.merge(&update);
(Some(old_entry), new_entry)
2020-04-09 18:58:39 +00:00
}
None => (None, update.clone()),
2020-04-09 18:58:39 +00:00
};
2020-04-08 21:01:49 +00:00
2020-04-09 18:58:39 +00:00
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RMPEncode)
.map_err(sled::ConflictableTransactionError::Abort)?;
db.insert(tree_key.clone(), new_bytes)?;
Ok((old_entry, new_entry))
})?;
2020-04-08 21:01:49 +00:00
2020-04-09 18:58:39 +00:00
self.instance.updated(old_entry.as_ref(), &new_entry).await;
2020-04-08 21:01:49 +00:00
}
Ok(())
2020-04-08 20:00:41 +00:00
}
2020-04-09 14:16:27 +00:00
/// Builds the local storage key for an entry: the bytes of the partition
/// key's hash followed by the bytes of the sort key.
fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
    let partition_hash = p.hash();
    let mut key = partition_hash.to_vec();
    key.extend_from_slice(s.sort_key());
    key
}
2020-04-08 20:00:41 +00:00
}