garage/src/table/table.rs

315 lines
8 KiB
Rust
Raw Normal View History

use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::Duration;
2020-04-08 20:00:41 +00:00
use futures::stream::*;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
2020-04-24 10:10:01 +00:00
use garage_util::data::*;
use garage_util::error::Error;
2020-04-23 17:05:46 +00:00
2021-02-21 12:11:10 +00:00
use garage_rpc::membership::System;
2020-04-24 10:10:01 +00:00
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
2020-04-23 17:05:46 +00:00
use crate::crdt::CRDT;
2021-03-11 15:54:15 +00:00
use crate::data::*;
2021-03-12 20:52:19 +00:00
use crate::gc::*;
2021-03-11 15:54:15 +00:00
use crate::replication::*;
use crate::schema::*;
use crate::sync::*;
2020-04-08 20:00:41 +00:00
/// Timeout (10 seconds) passed to every table RPC issued from this file,
/// both quorum calls (`try_call_many`) and single-node calls (`call`).
const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
2021-03-12 14:07:23 +00:00
pub struct TableAux<R: TableReplication> {
2021-03-11 15:54:15 +00:00
pub system: Arc<System>,
pub replication: R,
2021-03-11 15:54:15 +00:00
}
2020-04-08 20:00:41 +00:00
2021-03-11 15:54:15 +00:00
pub struct Table<F: TableSchema, R: TableReplication> {
pub data: Arc<TableData<F>>,
2021-03-12 14:07:23 +00:00
pub aux: Arc<TableAux<R>>,
2021-03-11 15:54:15 +00:00
pub syncer: Arc<TableSyncer<F, R>>,
2021-03-12 14:07:23 +00:00
rpc_client: Arc<RpcClient<TableRPC<F>>>,
2020-04-08 20:00:41 +00:00
}
2020-04-08 21:01:49 +00:00
#[derive(Serialize, Deserialize)]
pub(crate) enum TableRPC<F: TableSchema> {
2020-04-08 21:01:49 +00:00
Ok,
2020-04-08 21:47:34 +00:00
2020-04-09 14:16:27 +00:00
ReadEntry(F::P, F::S),
ReadEntryResponse(Option<ByteBuf>),
2020-04-08 21:47:34 +00:00
// Read range: read all keys in partition P, possibly starting at a certain sort key offset
ReadRange(F::P, Option<F::S>, Option<F::Filter>, usize),
Update(Vec<Arc<ByteBuf>>),
2020-04-08 20:00:41 +00:00
}
2020-04-18 17:21:34 +00:00
// Empty impl: tags `TableRPC<F>` as an `RpcMessage`.
// NOTE(review): presumably this is what `RpcClient<_>` and
// `RpcServer::add_handler` require of their message type — confirm in garage_rpc.
impl<F: TableSchema> RpcMessage for TableRPC<F> {}
impl<F, R> Table<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
// =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) ===============
pub fn new(
instance: F,
replication: R,
system: Arc<System>,
db: &sled::Db,
name: String,
2020-04-18 17:21:34 +00:00
rpc_server: &mut RpcServer,
2020-04-12 20:24:53 +00:00
) -> Arc<Self> {
2020-04-18 17:21:34 +00:00
let rpc_path = format!("table_{}", name);
let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path);
let data = TableData::new(name, instance, db, system.background.clone());
let aux = Arc::new(TableAux {
2021-03-11 15:54:15 +00:00
system,
replication,
2020-04-16 12:50:49 +00:00
});
2020-04-18 17:21:34 +00:00
let syncer = TableSyncer::launch(data.clone(), aux.clone(), rpc_server);
TableGC::launch(data.clone(), aux.clone(), rpc_server);
2020-04-18 17:21:34 +00:00
2021-03-12 14:07:23 +00:00
let table = Arc::new(Self {
data,
aux,
syncer,
rpc_client,
});
2021-03-11 15:54:15 +00:00
table.clone().register_handler(rpc_server, rpc_path);
2020-04-16 12:50:49 +00:00
table
2020-04-08 20:00:41 +00:00
}
2020-04-09 14:16:27 +00:00
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash();
2021-03-16 10:14:27 +00:00
let who = self.aux.replication.write_nodes(&hash);
2020-04-17 19:59:07 +00:00
//eprintln!("insert who: {:?}", who);
2020-04-08 21:01:49 +00:00
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
2020-04-18 17:21:34 +00:00
let rpc = TableRPC::<F>::Update(vec![e_enc]);
2021-03-12 14:07:23 +00:00
self.rpc_client
.try_call_many(
&who[..],
rpc,
2021-03-16 10:14:27 +00:00
RequestStrategy::with_quorum(self.aux.replication.write_quorum())
.with_timeout(TABLE_RPC_TIMEOUT),
)
.await?;
2020-04-08 21:01:49 +00:00
Ok(())
}
2020-04-08 20:00:41 +00:00
pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> {
let mut call_list = HashMap::new();
for entry in entries.iter() {
let hash = entry.partition_key().hash();
2021-03-16 10:14:27 +00:00
let who = self.aux.replication.write_nodes(&hash);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
for node in who {
if !call_list.contains_key(&node) {
call_list.insert(node, vec![]);
}
call_list.get_mut(&node).unwrap().push(e_enc.clone());
}
}
let call_futures = call_list.drain().map(|(node, entries)| async move {
let rpc = TableRPC::<F>::Update(entries);
2021-03-12 14:07:23 +00:00
let resp = self.rpc_client.call(node, rpc, TABLE_RPC_TIMEOUT).await?;
Ok::<_, Error>((node, resp))
});
let mut resps = call_futures.collect::<FuturesUnordered<_>>();
let mut errors = vec![];
while let Some(resp) = resps.next().await {
if let Err(e) = resp {
errors.push(e);
}
}
2021-03-11 15:54:15 +00:00
if errors.len() > self.aux.replication.max_write_errors() {
Err(Error::Message("Too many errors".into()))
} else {
Ok(())
}
}
2020-04-16 12:50:49 +00:00
pub async fn get(
self: &Arc<Self>,
partition_key: &F::P,
sort_key: &F::S,
) -> Result<Option<F::E>, Error> {
2020-04-09 14:16:27 +00:00
let hash = partition_key.hash();
2021-03-16 10:14:27 +00:00
let who = self.aux.replication.read_nodes(&hash);
2020-04-17 19:59:07 +00:00
//eprintln!("get who: {:?}", who);
2020-04-08 20:00:41 +00:00
2020-04-18 17:21:34 +00:00
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
let resps = self
2020-04-18 17:21:34 +00:00
.rpc_client
.try_call_many(
&who[..],
rpc,
2021-03-11 15:54:15 +00:00
RequestStrategy::with_quorum(self.aux.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
2020-04-08 21:01:49 +00:00
.await?;
2020-04-08 21:47:34 +00:00
let mut ret = None;
2020-04-09 18:58:39 +00:00
let mut not_all_same = false;
2020-04-08 21:01:49 +00:00
for resp in resps {
2020-04-08 21:47:34 +00:00
if let TableRPC::ReadEntryResponse(value) = resp {
if let Some(v_bytes) = value {
2021-03-11 15:54:15 +00:00
let v = self.data.decode_entry(v_bytes.as_slice())?;
2020-04-08 21:47:34 +00:00
ret = match ret {
None => Some(v),
Some(mut x) => {
2020-04-09 21:45:07 +00:00
if x != v {
2020-04-09 18:58:39 +00:00
not_all_same = true;
2020-04-09 21:45:07 +00:00
x.merge(&v);
2020-04-09 18:58:39 +00:00
}
2020-04-08 21:47:34 +00:00
Some(x)
}
}
2020-04-08 21:01:49 +00:00
}
2020-04-08 21:47:34 +00:00
} else {
return Err(Error::Message(format!("Invalid return value to read")));
2020-04-08 21:01:49 +00:00
}
}
2020-04-09 18:58:39 +00:00
if let Some(ret_entry) = &ret {
if not_all_same {
let self2 = self.clone();
let ent2 = ret_entry.clone();
self.aux
.system
2020-04-16 12:50:49 +00:00
.background
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
2020-04-09 18:58:39 +00:00
}
}
2020-04-08 21:47:34 +00:00
Ok(ret)
2020-04-08 21:01:49 +00:00
}
pub async fn get_range(
self: &Arc<Self>,
partition_key: &F::P,
begin_sort_key: Option<F::S>,
filter: Option<F::Filter>,
limit: usize,
) -> Result<Vec<F::E>, Error> {
let hash = partition_key.hash();
2021-03-16 10:14:27 +00:00
let who = self.aux.replication.read_nodes(&hash);
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
let resps = self
.rpc_client
.try_call_many(
&who[..],
rpc,
2021-03-11 15:54:15 +00:00
RequestStrategy::with_quorum(self.aux.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
.await?;
let mut ret = BTreeMap::new();
let mut to_repair = BTreeMap::new();
for resp in resps {
if let TableRPC::Update(entries) = resp {
for entry_bytes in entries.iter() {
2021-03-11 15:54:15 +00:00
let entry = self.data.decode_entry(entry_bytes.as_slice())?;
let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key());
match ret.remove(&entry_key) {
None => {
ret.insert(entry_key, Some(entry));
}
Some(Some(mut prev)) => {
let must_repair = prev != entry;
prev.merge(&entry);
if must_repair {
to_repair.insert(entry_key.clone(), Some(prev.clone()));
}
ret.insert(entry_key, Some(prev));
}
Some(None) => unreachable!(),
}
}
}
}
if !to_repair.is_empty() {
let self2 = self.clone();
2021-03-11 15:54:15 +00:00
self.aux.system.background.spawn_cancellable(async move {
for (_, v) in to_repair.iter_mut() {
self2.repair_on_read(&who[..], v.take().unwrap()).await?;
}
Ok(())
});
}
let ret_vec = ret
.iter_mut()
.take(limit)
.map(|(_k, v)| v.take().unwrap())
.collect::<Vec<_>>();
Ok(ret_vec)
}
// =============== UTILITY FUNCTION FOR CLIENT OPERATIONS ===============
async fn repair_on_read(&self, who: &[UUID], what: F::E) -> Result<(), Error> {
2020-04-16 12:50:49 +00:00
let what_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(&what)?));
2021-03-12 14:07:23 +00:00
self.rpc_client
2020-04-18 17:21:34 +00:00
.try_call_many(
&who[..],
TableRPC::<F>::Update(vec![what_enc]),
RequestStrategy::with_quorum(who.len()).with_timeout(TABLE_RPC_TIMEOUT),
2020-04-18 17:21:34 +00:00
)
2020-04-16 12:50:49 +00:00
.await?;
Ok(())
}
// =============== HANDLERS FOR RPC OPERATIONS (SERVER SIDE) ==============
2020-04-18 17:21:34 +00:00
fn register_handler(self: Arc<Self>, rpc_server: &mut RpcServer, path: String) {
let self2 = self.clone();
2020-04-18 17:21:34 +00:00
rpc_server.add_handler::<TableRPC<F>, _, _>(path, move |msg, _addr| {
let self2 = self2.clone();
async move { self2.handle(&msg).await }
});
let self2 = self.clone();
2021-03-12 14:07:23 +00:00
self.rpc_client
2021-03-11 15:54:15 +00:00
.set_local_handler(self.aux.system.id, move |msg| {
let self2 = self2.clone();
async move { self2.handle(&msg).await }
});
}
async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> {
2020-04-08 21:01:49 +00:00
match msg {
2020-04-08 21:47:34 +00:00
TableRPC::ReadEntry(key, sort_key) => {
2021-03-11 15:54:15 +00:00
let value = self.data.read_entry(key, sort_key)?;
2020-04-08 21:47:34 +00:00
Ok(TableRPC::ReadEntryResponse(value))
2020-04-08 21:01:49 +00:00
}
TableRPC::ReadRange(key, begin_sort_key, filter, limit) => {
2021-03-11 15:54:15 +00:00
let values = self.data.read_range(key, begin_sort_key, filter, *limit)?;
Ok(TableRPC::Update(values))
}
2020-04-08 21:01:49 +00:00
TableRPC::Update(pairs) => {
2021-03-11 15:54:15 +00:00
self.data.update_many(pairs)?;
2020-04-08 21:01:49 +00:00
Ok(TableRPC::Ok)
}
2020-11-08 14:04:30 +00:00
_ => Err(Error::BadRPC(format!("Unexpected table RPC"))),
2020-04-08 21:01:49 +00:00
}
}
2020-04-08 20:00:41 +00:00
}