WIP add content defined chunking #42

Closed
trinity-1686a wants to merge 42 commits from content-defined-chunking into master
60 changed files with 3935 additions and 3397 deletions

View file

@ -12,7 +12,7 @@ steps:
image: alpine/git
commands:
- mkdir -p cargo
- git clone https://git.deuxfleurs.fr/Deuxfleurs/garage.git
- git clone $DRONE_GIT_HTTP_URL
- cd garage
- git checkout $DRONE_COMMIT
@ -34,8 +34,8 @@ steps:
- 'garage/target'
- 'cargo/registry/index'
- 'cargo/registry/cache'
- 'cargo/git/db'
- 'cargo/bin'
- 'cargo/git/db'
path_style: true
endpoint: https://garage.deuxfleurs.fr

809
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -4,7 +4,7 @@ DOCKER=lxpz/garage_amd64
all:
#cargo fmt || true
#RUSTFLAGS="-C link-arg=-fuse-ld=lld" cargo build
cargo build
clear; cargo build
$(BIN):
#RUSTFLAGS="-C link-arg=-fuse-ld=lld" cargo build --release

View file

@ -11,7 +11,7 @@ PATH="${GARAGE_DEBUG}:${GARAGE_RELEASE}:$PATH"
garage bucket create eprouvette
KEY_INFO=`garage key new --name opérateur`
ACCESS_KEY=`echo $KEY_INFO|grep -Po 'GK[a-f0-9]+'`
SECRET_KEY=`echo $KEY_INFO|grep -Po 'secret_key: "[a-f0-9]+'|grep -Po '[a-f0-9]+$'`
SECRET_KEY=`echo $KEY_INFO|grep -Po 'Secret key: [a-f0-9]+'|grep -Po '[a-f0-9]+$'`
garage bucket allow eprouvette --read --write --key $ACCESS_KEY
echo "$ACCESS_KEY $SECRET_KEY" > /tmp/garage.s3

View file

@ -17,26 +17,27 @@ garage_util = { version = "0.1.1", path = "../util" }
garage_table = { version = "0.1.1", path = "../table" }
garage_model = { version = "0.1.1", path = "../model" }
err-derive = "0.2.3"
bytes = "0.4"
hex = "0.3"
base64 = "0.13"
log = "0.4"
bytes = "1.0"
chrono = "0.4"
md-5 = "0.9.1"
sha2 = "0.8"
hmac = "0.7"
crypto-mac = "0.7"
crypto-mac = "0.10"
err-derive = "0.3"
fastcdc = "1.0.5"
hex = "0.4"
hmac = "0.10"
log = "0.4"
md-5 = "0.9"
rand = "0.7"
sha2 = "0.9"
futures = "0.3"
futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] }
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
http = "0.2"
hyper = "^0.13.6"
hyper = "0.14"
url = "2.1"
httpdate = "0.3"
percent-encoding = "2.1.0"
roxmltree = "0.11"
roxmltree = "0.14"
http-range = "0.1"

View file

@ -137,7 +137,10 @@ async fn handler_inner(garage: Arc<Garage>, req: Request<Body>) -> Result<Respon
)));
}
let source_key = source_key.ok_or_bad_request("No source key specified")?;
Ok(handle_copy(garage, &bucket, &key, &source_bucket, &source_key).await?)
Ok(
handle_copy(garage, &req, &bucket, &key, &source_bucket, &source_key)
.await?,
)
} else {
// PutObject query
Ok(handle_put(garage, req, &bucket, &key, content_sha256).await?)

View file

@ -33,7 +33,7 @@ pub enum Error {
InvalidBase64(#[error(source)] base64::DecodeError),
#[error(display = "Invalid XML: {}", _0)]
InvalidXML(#[error(source)] roxmltree::Error),
InvalidXML(String),
#[error(display = "Invalid header value: {}", _0)]
InvalidHeader(#[error(source)] hyper::header::ToStrError),
@ -45,6 +45,12 @@ pub enum Error {
BadRequest(String),
}
impl From<roxmltree::Error> for Error {
fn from(err: roxmltree::Error) -> Self {
Self::InvalidXML(format!("{}", err))
}
}
impl Error {
pub fn http_status_code(&self) -> StatusCode {
match self {

View file

@ -1,11 +1,11 @@
use std::fmt::Write;
use std::sync::Arc;
use chrono::{SecondsFormat, Utc};
use hyper::{Body, Response};
use hyper::{Body, Request, Response};
use garage_table::*;
use garage_util::data::*;
use garage_util::time::*;
use garage_model::block_ref_table::*;
use garage_model::garage::Garage;
@ -13,9 +13,11 @@ use garage_model::object_table::*;
use garage_model::version_table::*;
use crate::error::*;
use crate::s3_put::get_headers;
pub async fn handle_copy(
garage: Arc<Garage>,
req: &Request<Body>,
dest_bucket: &str,
dest_key: &str,
source_bucket: &str,
@ -41,17 +43,37 @@ pub async fn handle_copy(
};
let new_uuid = gen_uuid();
let dest_object_version = ObjectVersion {
uuid: new_uuid,
timestamp: now_msec(),
state: ObjectVersionState::Complete(source_last_state.clone()),
};
let new_timestamp = now_msec();
match &source_last_state {
// Implement x-amz-metadata-directive: REPLACE
let old_meta = match source_last_state {
ObjectVersionData::DeleteMarker => {
return Err(Error::NotFound);
}
ObjectVersionData::Inline(_meta, _bytes) => {
ObjectVersionData::Inline(meta, _bytes) => meta,
ObjectVersionData::FirstBlock(meta, _fbh) => meta,
};
let new_meta = match req.headers().get("x-amz-metadata-directive") {
Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta {
headers: get_headers(req)?,
size: old_meta.size,
etag: old_meta.etag.clone(),
},
_ => old_meta.clone(),
};
// Save object copy
match source_last_state {
ObjectVersionData::DeleteMarker => unreachable!(),
ObjectVersionData::Inline(_meta, bytes) => {
let dest_object_version = ObjectVersion {
uuid: new_uuid,
timestamp: new_timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::Inline(
new_meta,
bytes.clone(),
)),
};
let dest_object = Object::new(
dest_bucket.to_string(),
dest_key.to_string(),
@ -59,44 +81,84 @@ pub async fn handle_copy(
);
garage.object_table.insert(&dest_object).await?;
}
ObjectVersionData::FirstBlock(_meta, _first_block_hash) => {
ObjectVersionData::FirstBlock(_meta, first_block_hash) => {
// Get block list from source version
let source_version = garage
.version_table
.get(&source_last_v.uuid, &EmptyKey)
.await?;
let source_version = source_version.ok_or(Error::NotFound)?;
let dest_version = Version::new(
// Write an "uploading" marker in Object table
// This holds a reference to the object in the Version table
// so that it won't be deleted, e.g. by repair_versions.
let tmp_dest_object_version = ObjectVersion {
uuid: new_uuid,
timestamp: new_timestamp,
state: ObjectVersionState::Uploading(new_meta.headers.clone()),
};
let tmp_dest_object = Object::new(
dest_bucket.to_string(),
dest_key.to_string(),
vec![tmp_dest_object_version],
);
garage.object_table.insert(&tmp_dest_object).await?;
// Write version in the version table. Even with empty block list,
// this means that the BlockRef entries linked to this version cannot be
// marked as deleted (they are marked as deleted only if the Version
// doesn't exist or is marked as deleted).
let mut dest_version = Version::new(
new_uuid,
dest_bucket.to_string(),
dest_key.to_string(),
false,
source_version.blocks().to_vec(),
);
garage.version_table.insert(&dest_version).await?;
// Fill in block list for version and insert block refs
for (bk, bv) in source_version.blocks.items().iter() {
dest_version.blocks.put(*bk, *bv);
}
let dest_block_refs = dest_version
.blocks
.items()
.iter()
.map(|b| BlockRef {
block: b.1.hash,
version: new_uuid,
deleted: false.into(),
})
.collect::<Vec<_>>();
futures::try_join!(
garage.version_table.insert(&dest_version),
garage.block_ref_table.insert_many(&dest_block_refs[..]),
)?;
// Insert final object
// We do this last because otherwise there is a race condition in the case where
// the copy call has the same source and destination (this happens, rclone does
// it to update the modification timestamp for instance). If we did this concurrently
// with the stuff before, the block's reference counts could be decremented before
// they are incremented again for the new version, leading to data being deleted.
let dest_object_version = ObjectVersion {
uuid: new_uuid,
timestamp: new_timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
new_meta,
*first_block_hash,
)),
};
let dest_object = Object::new(
dest_bucket.to_string(),
dest_key.to_string(),
vec![dest_object_version],
);
let dest_block_refs = dest_version
.blocks()
.iter()
.map(|b| BlockRef {
block: b.hash,
version: new_uuid,
deleted: false,
})
.collect::<Vec<_>>();
futures::try_join!(
garage.object_table.insert(&dest_object),
garage.version_table.insert(&dest_version),
garage.block_ref_table.insert_many(&dest_block_refs[..]),
)?;
garage.object_table.insert(&dest_object).await?;
}
}
let now = Utc::now();
let last_modified = now.to_rfc3339_opts(SecondsFormat::Secs, true);
let last_modified = msec_to_rfc3339(new_timestamp);
let mut xml = String::new();
writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap();
writeln!(&mut xml, r#"<CopyObjectResult>"#).unwrap();

View file

@ -4,6 +4,7 @@ use std::sync::Arc;
use hyper::{Body, Request, Response};
use garage_util::data::*;
use garage_util::time::*;
use garage_model::garage::Garage;
use garage_model::object_table::*;
@ -29,16 +30,16 @@ async fn handle_delete_internal(
_ => true,
});
let mut must_delete = None;
let mut version_to_delete = None;
let mut timestamp = now_msec();
for v in interesting_versions {
if v.timestamp + 1 > timestamp || must_delete.is_none() {
must_delete = Some(v.uuid);
if v.timestamp + 1 > timestamp || version_to_delete.is_none() {
version_to_delete = Some(v.uuid);
}
timestamp = std::cmp::max(timestamp, v.timestamp + 1);
}
let deleted_version = must_delete.ok_or(Error::NotFound)?;
let deleted_version = version_to_delete.ok_or(Error::NotFound)?;
let version_uuid = gen_uuid();
@ -47,7 +48,7 @@ async fn handle_delete_internal(
key.into(),
vec![ObjectVersion {
uuid: version_uuid,
timestamp: now_msec(),
timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::DeleteMarker),
}],
);

View file

@ -146,9 +146,10 @@ pub async fn handle_get(
let version = version.ok_or(Error::NotFound)?;
let mut blocks = version
.blocks()
.blocks
.items()
.iter()
.map(|vb| (vb.hash, None))
.map(|(_, vb)| (vb.hash, None))
.collect::<Vec<_>>();
blocks[0].1 = Some(first_block);
@ -219,11 +220,12 @@ pub async fn handle_get_range(
// file (whereas block.offset designates the offset of the block WITHIN THE PART
// block.part_number, which is not the same in the case of a multipart upload)
let mut blocks = Vec::with_capacity(std::cmp::min(
version.blocks().len(),
4 + ((end - begin) / std::cmp::max(version.blocks()[0].size as u64, 1024)) as usize,
version.blocks.len(),
4 + ((end - begin) / std::cmp::max(version.blocks.items()[0].1.size as u64, 1024))
as usize,
));
let mut true_offset = 0;
for b in version.blocks().iter() {
for (_, b) in version.blocks.items().iter() {
if true_offset >= end {
break;
}

View file

@ -2,10 +2,10 @@ use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fmt::Write;
use std::sync::Arc;
use chrono::{DateTime, NaiveDateTime, SecondsFormat, Utc};
use hyper::{Body, Response};
use garage_util::error::Error as GarageError;
use garage_util::time::*;
use garage_model::garage::Garage;
use garage_model::object_table::*;
@ -42,7 +42,7 @@ pub fn parse_list_objects_query(
Ok(ListObjectsQuery {
is_v2: params.get("list-type").map(|x| x == "2").unwrap_or(false),
bucket: bucket.to_string(),
delimiter: params.get("delimiter").cloned(),
delimiter: params.get("delimiter").filter(|x| !x.is_empty()).cloned(),
max_keys: params
.get("max-keys")
.map(|x| {
@ -247,9 +247,7 @@ pub async fn handle_list(
}
for (key, info) in result_keys.iter() {
let last_modif = NaiveDateTime::from_timestamp(info.last_modified as i64 / 1000, 0);
let last_modif = DateTime::<Utc>::from_utc(last_modif, Utc);
let last_modif = last_modif.to_rfc3339_opts(SecondsFormat::Millis, true);
let last_modif = msec_to_rfc3339(info.last_modified);
writeln!(&mut xml, "\t<Contents>").unwrap();
writeln!(
&mut xml,

View file

@ -2,14 +2,16 @@ use std::collections::{BTreeMap, VecDeque};
use std::fmt::Write;
use std::sync::Arc;
use fastcdc::{Chunk, FastCDC};
use futures::stream::*;
use hyper::{Body, Request, Response};
use md5::{digest::generic_array::*, Digest as Md5Digest, Md5};
use sha2::{Digest as Sha256Digest, Sha256};
use sha2::Sha256;
use garage_table::*;
use garage_util::data::*;
use garage_util::error::Error as GarageError;
use garage_util::time::*;
use garage_model::block::INLINE_THRESHOLD;
use garage_model::block_ref_table::*;
@ -52,14 +54,14 @@ pub async fn handle_put(
if first_block.len() < INLINE_THRESHOLD {
let mut md5sum = Md5::new();
md5sum.update(&first_block[..]);
let md5sum_arr = md5sum.finalize();
let md5sum_hex = hex::encode(md5sum_arr);
let data_md5sum = md5sum.finalize();
let data_md5sum_hex = hex::encode(data_md5sum);
let sha256sum_hash = sha256sum(&first_block[..]);
let data_sha256sum = sha256sum(&first_block[..]);
ensure_checksum_matches(
md5sum_arr.as_slice(),
sha256sum_hash,
data_md5sum.as_slice(),
data_sha256sum,
content_md5.as_deref(),
content_sha256,
)?;
@ -71,7 +73,7 @@ pub async fn handle_put(
ObjectVersionMeta {
headers,
size: first_block.len() as u64,
etag: md5sum_hex.clone(),
etag: data_md5sum_hex.clone(),
},
first_block,
)),
@ -80,41 +82,45 @@ pub async fn handle_put(
let object = Object::new(bucket.into(), key.into(), vec![object_version]);
garage.object_table.insert(&object).await?;
return Ok(put_response(version_uuid, md5sum_hex));
return Ok(put_response(version_uuid, data_md5sum_hex));
}
// Write version identifier in object table so that we have a trace
// that we are uploading something
let mut object_version = ObjectVersion {
uuid: version_uuid,
timestamp: now_msec(),
timestamp: version_timestamp,
state: ObjectVersionState::Uploading(headers.clone()),
};
let object = Object::new(bucket.into(), key.into(), vec![object_version.clone()]);
garage.object_table.insert(&object).await?;
// Initialize corresponding entry in version table
let version = Version::new(version_uuid, bucket.into(), key.into(), false, vec![]);
let first_block_hash = sha256sum(&first_block[..]);
// Write this entry now, even with empty block list,
// to prevent block_ref entries from being deleted (they can be deleted
// if the reference a version that isn't found in the version table)
let version = Version::new(version_uuid, bucket.into(), key.into(), false);
garage.version_table.insert(&version).await?;
// Transfer data and verify checksum
let first_block_hash = blake2sum(&first_block[..]);
let tx_result = read_and_put_blocks(
&garage,
version,
&version,
1,
first_block,
first_block_hash,
&mut chunker,
)
.await
.and_then(|(total_size, md5sum_arr, sha256sum)| {
.and_then(|(total_size, data_md5sum, data_sha256sum)| {
ensure_checksum_matches(
md5sum_arr.as_slice(),
sha256sum,
data_md5sum.as_slice(),
data_sha256sum,
content_md5.as_deref(),
content_sha256,
)
.map(|()| (total_size, md5sum_arr))
.map(|()| (total_size, data_md5sum))
});
// If something went wrong, clean up
@ -148,13 +154,13 @@ pub async fn handle_put(
/// Validate MD5 sum against content-md5 header
/// and sha256sum against signed content-sha256
fn ensure_checksum_matches(
md5sum: &[u8],
sha256sum: garage_util::data::FixedBytes32,
data_md5sum: &[u8],
data_sha256sum: garage_util::data::FixedBytes32,
content_md5: Option<&str>,
content_sha256: Option<garage_util::data::FixedBytes32>,
) -> Result<(), Error> {
if let Some(expected_sha256) = content_sha256 {
if expected_sha256 != sha256sum {
if expected_sha256 != data_sha256sum {
return Err(Error::BadRequest(format!(
"Unable to validate x-amz-content-sha256"
)));
@ -163,7 +169,7 @@ fn ensure_checksum_matches(
}
}
if let Some(expected_md5) = content_md5 {
if expected_md5.trim_matches('"') != base64::encode(md5sum) {
if expected_md5.trim_matches('"') != base64::encode(data_md5sum) {
return Err(Error::BadRequest(format!("Unable to validate content-md5")));
} else {
trace!("Successfully validated content-md5");
@ -173,8 +179,8 @@ fn ensure_checksum_matches(
}
async fn read_and_put_blocks(
garage: &Arc<Garage>,
version: Version,
garage: &Garage,
version: &Version,
part_number: u64,
first_block: Vec<u8>,
first_block_hash: Hash,
@ -183,11 +189,11 @@ async fn read_and_put_blocks(
let mut md5hasher = Md5::new();
let mut sha256hasher = Sha256::new();
md5hasher.update(&first_block[..]);
sha256hasher.input(&first_block[..]);
sha256hasher.update(&first_block[..]);
let mut next_offset = first_block.len();
let mut put_curr_version_block = put_block_meta(
garage.clone(),
&garage,
&version,
part_number,
0,
@ -203,11 +209,11 @@ async fn read_and_put_blocks(
futures::try_join!(put_curr_block, put_curr_version_block, chunker.next())?;
if let Some(block) = next_block {
md5hasher.update(&block[..]);
sha256hasher.input(&block[..]);
let block_hash = sha256sum(&block[..]);
sha256hasher.update(&block[..]);
let block_hash = blake2sum(&block[..]);
let block_len = block.len();
put_curr_version_block = put_block_meta(
garage.clone(),
&garage,
&version,
part_number,
next_offset as u64,
@ -222,39 +228,35 @@ async fn read_and_put_blocks(
}
let total_size = next_offset as u64;
let md5sum_arr = md5hasher.finalize();
let data_md5sum = md5hasher.finalize();
let sha256sum_arr = sha256hasher.result();
let mut hash = [0u8; 32];
hash.copy_from_slice(&sha256sum_arr[..]);
let sha256sum_arr = Hash::from(hash);
let data_sha256sum = sha256hasher.finalize();
let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap();
Ok((total_size, md5sum_arr, sha256sum_arr))
Ok((total_size, data_md5sum, data_sha256sum))
}
async fn put_block_meta(
garage: Arc<Garage>,
garage: &Garage,
version: &Version,
part_number: u64,
offset: u64,
hash: Hash,
size: u64,
) -> Result<(), GarageError> {
// TODO: don't clone, restart from empty block list ??
let mut version = version.clone();
version
.add_block(VersionBlock {
version.blocks.put(
VersionBlockKey {
part_number,
offset,
hash,
size,
})
.unwrap();
},
VersionBlock { hash, size },
);
let block_ref = BlockRef {
block: hash,
version: version.uuid,
deleted: false,
deleted: false.into(),
};
futures::try_join!(
@ -267,21 +269,34 @@ async fn put_block_meta(
struct BodyChunker {
body: Body,
read_all: bool,
block_size: usize,
min_block_size: usize,
avg_block_size: usize,
max_block_size: usize,
buf: VecDeque<u8>,
chunk_buf: VecDeque<Chunk>,
}
impl BodyChunker {
fn new(body: Body, block_size: usize) -> Self {
let min_block_size = block_size / 4 * 3;
let avg_block_size = block_size;
let max_block_size = block_size * 2;
Self {
body,
read_all: false,
block_size,
buf: VecDeque::with_capacity(2 * block_size),
min_block_size,
avg_block_size,
max_block_size,
buf: VecDeque::with_capacity(2 * max_block_size),
chunk_buf: VecDeque::new(),
}
}
async fn next(&mut self) -> Result<Option<Vec<u8>>, GarageError> {
while !self.read_all && self.buf.len() < self.block_size {
if let Some(Chunk {length, ..}) = self.chunk_buf.pop_front() {
let block = self.buf.drain(..length).collect::<Vec<u8>>();
return Ok(Some(block));
}
while !self.read_all && self.buf.len() < self.max_block_size {
if let Some(block) = self.body.next().await {
let bytes = block?;
trace!("Body next: {} bytes", bytes.len());
@ -292,12 +307,14 @@ impl BodyChunker {
}
if self.buf.len() == 0 {
Ok(None)
} else if self.buf.len() <= self.block_size {
let block = self.buf.drain(..).collect::<Vec<u8>>();
Ok(Some(block))
} else {
let block = self.buf.drain(..self.block_size).collect::<Vec<u8>>();
Ok(Some(block))
let mut iter = FastCDC::with_eof(self.buf.make_contiguous(), self.min_block_size, self.avg_block_size, self.max_block_size, self.read_all);
if let Some(Chunk {length, ..}) = iter.next() {
let block = self.buf.drain(..length).collect::<Vec<u8>>();
Ok(Some(block))
} else {
Ok(None)
}
}
}
}
@ -319,6 +336,7 @@ pub async fn handle_create_multipart_upload(
let version_uuid = gen_uuid();
let headers = get_headers(req)?;
// Create object in object table
let object_version = ObjectVersion {
uuid: version_uuid,
timestamp: now_msec(),
@ -327,6 +345,14 @@ pub async fn handle_create_multipart_upload(
let object = Object::new(bucket.to_string(), key.to_string(), vec![object_version]);
garage.object_table.insert(&object).await?;
// Insert empty version so that block_ref entries refer to something
// (they are inserted concurrently with blocks in the version table, so
// there is the possibility that they are inserted before the version table
// is created, in which case it is allowed to delete them, e.g. in repair_*)
let version = Version::new(version_uuid, bucket.into(), key.into(), false);
garage.version_table.insert(&version).await?;
// Send success response
let mut xml = String::new();
writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap();
writeln!(
@ -389,11 +415,11 @@ pub async fn handle_put_part(
}
// Copy block to store
let version = Version::new(version_uuid, bucket, key, false, vec![]);
let first_block_hash = sha256sum(&first_block[..]);
let (_, md5sum_arr, sha256sum) = read_and_put_blocks(
let version = Version::new(version_uuid, bucket, key, false);
let first_block_hash = blake2sum(&first_block[..]);
let (_, data_md5sum, data_sha256sum) = read_and_put_blocks(
&garage,
version,
&version,
part_number,
first_block,
first_block_hash,
@ -401,15 +427,24 @@ pub async fn handle_put_part(
)
.await?;
// Verify that checksums map
ensure_checksum_matches(
md5sum_arr.as_slice(),
sha256sum,
data_md5sum.as_slice(),
data_sha256sum,
content_md5.as_deref(),
content_sha256,
)?;
// Store part etag in version
let data_md5sum_hex = hex::encode(data_md5sum);
let mut version = version;
version
.parts_etags
.put(part_number, data_md5sum_hex.clone());
garage.version_table.insert(&version).await?;
let response = Response::builder()
.header("ETag", format!("\"{}\"", hex::encode(md5sum_arr)))
.header("ETag", format!("\"{}\"", data_md5sum_hex))
.body(Body::from(vec![]))
.unwrap();
Ok(response)
@ -444,17 +479,15 @@ pub async fn handle_complete_multipart_upload(
)?;
let object = object.ok_or(Error::BadRequest(format!("Object not found")))?;
let object_version = object
let mut object_version = object
.versions()
.iter()
.find(|v| v.uuid == version_uuid && v.is_uploading());
let mut object_version = match object_version {
None => return Err(Error::NotFound),
Some(x) => x.clone(),
};
.find(|v| v.uuid == version_uuid && v.is_uploading())
.cloned()
.ok_or(Error::BadRequest(format!("Version not found")))?;
let version = version.ok_or(Error::BadRequest(format!("Version not found")))?;
if version.blocks().len() == 0 {
if version.blocks.len() == 0 {
return Err(Error::BadRequest(format!("No data was uploaded")));
}
@ -464,53 +497,50 @@ pub async fn handle_complete_multipart_upload(
};
// Check that the list of parts they gave us corresponds to the parts we have here
// TODO: check MD5 sum of all uploaded parts? but that would mean we have to store them somewhere...
let mut parts = version
.blocks()
debug!("Expected parts from request: {:?}", body_list_of_parts);
debug!("Parts stored in version: {:?}", version.parts_etags.items());
let parts = version
.parts_etags
.items()
.iter()
.map(|x| x.part_number)
.collect::<Vec<_>>();
parts.dedup();
.map(|pair| (&pair.0, &pair.1));
let same_parts = body_list_of_parts
.iter()
.map(|x| &x.part_number)
.eq(parts.iter());
.map(|x| (&x.part_number, &x.etag))
.eq(parts);
if !same_parts {
return Err(Error::BadRequest(format!("We don't have the same parts")));
}
// ETag calculation: we produce ETags that have the same form as
// those of S3 multipart uploads, but we don't use their actual
// calculation for the first part (we use random bytes). This
// shouldn't impact compatibility as the S3 docs specify that
// the ETag is an opaque value in case of a multipart upload.
// See also: https://teppen.io/2018/06/23/aws_s3_etags/
let num_parts = version.blocks().last().unwrap().part_number
- version.blocks().first().unwrap().part_number
// Calculate etag of final object
// To understand how etags are calculated, read more here:
// https://teppen.io/2018/06/23/aws_s3_etags/
let num_parts = version.blocks.items().last().unwrap().0.part_number
- version.blocks.items().first().unwrap().0.part_number
+ 1;
let etag = format!(
"{}-{}",
hex::encode(&rand::random::<[u8; 16]>()[..]),
num_parts
);
let mut etag_md5_hasher = Md5::new();
for (_, etag) in version.parts_etags.items().iter() {
etag_md5_hasher.update(etag.as_bytes());
}
let etag = format!("{}-{}", hex::encode(etag_md5_hasher.finalize()), num_parts);
let total_size = version
.blocks()
.iter()
.map(|x| x.size)
.fold(0, |x, y| x + y);
// Calculate total size of final object
let total_size = version.blocks.items().iter().map(|x| x.1.size).sum();
// Write final object version
object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
ObjectVersionMeta {
headers,
size: total_size,
etag: etag,
etag,
},
version.blocks()[0].hash,
version.blocks.items()[0].1.hash,
));
let final_object = Object::new(bucket.clone(), key.clone(), vec![object_version]);
garage.object_table.insert(&final_object).await?;
// Send response saying ok we're done
let mut xml = String::new();
writeln!(&mut xml, r#"<?xml version="1.0" encoding="UTF-8"?>"#).unwrap();
writeln!(
@ -570,17 +600,19 @@ fn get_mime_type(req: &Request<Body>) -> Result<String, Error> {
.to_string())
}
fn get_headers(req: &Request<Body>) -> Result<ObjectVersionHeaders, Error> {
pub(crate) fn get_headers(req: &Request<Body>) -> Result<ObjectVersionHeaders, Error> {
let content_type = get_mime_type(req)?;
let other_headers = vec![
let mut other = BTreeMap::new();
// Preserve standard headers
let standard_header = vec![
hyper::header::CACHE_CONTROL,
hyper::header::CONTENT_DISPOSITION,
hyper::header::CONTENT_ENCODING,
hyper::header::CONTENT_LANGUAGE,
hyper::header::EXPIRES,
];
let mut other = BTreeMap::new();
for h in other_headers.iter() {
for h in standard_header.iter() {
if let Some(v) = req.headers().get(h) {
match v.to_str() {
Ok(v_str) => {
@ -592,6 +624,21 @@ fn get_headers(req: &Request<Body>) -> Result<ObjectVersionHeaders, Error> {
}
}
}
// Preserve x-amz-meta- headers
for (k, v) in req.headers().iter() {
if k.as_str().starts_with("x-amz-meta-") {
match v.to_str() {
Ok(v_str) => {
other.insert(k.to_string(), v_str.to_string());
}
Err(e) => {
warn!("Discarding header {}, error in .to_str(): {}", k, e);
}
}
}
}
Ok(ObjectVersionHeaders {
content_type,
other,

View file

@ -1,7 +1,7 @@
use std::collections::HashMap;
use chrono::{DateTime, Duration, NaiveDateTime, Utc};
use hmac::{Hmac, Mac};
use hmac::{Hmac, Mac, NewMac};
use hyper::{Body, Method, Request};
use sha2::{Digest, Sha256};
@ -91,8 +91,8 @@ pub async fn check_signature(
"s3",
)
.ok_or_internal_error("Unable to build signing HMAC")?;
hmac.input(string_to_sign.as_bytes());
let signature = hex::encode(hmac.result().code());
hmac.update(string_to_sign.as_bytes());
let signature = hex::encode(hmac.finalize().into_bytes());
if authorization.signature != signature {
trace!("Canonical request: ``{}``", canonical_request);
@ -106,12 +106,10 @@ pub async fn check_signature(
} else {
let bytes = hex::decode(authorization.content_sha256)
.ok_or_bad_request("Invalid content sha256 hash")?;
let mut hash = [0u8; 32];
if bytes.len() != 32 {
return Err(Error::BadRequest(format!("Invalid content sha256 hash")));
}
hash.copy_from_slice(&bytes[..]);
Some(Hash::from(hash))
Some(
Hash::try_from(&bytes[..])
.ok_or(Error::BadRequest(format!("Invalid content sha256 hash")))?,
)
};
Ok((key, content_sha256))
@ -220,12 +218,12 @@ fn parse_credential(cred: &str) -> Result<(String, String), Error> {
fn string_to_sign(datetime: &DateTime<Utc>, scope_string: &str, canonical_req: &str) -> String {
let mut hasher = Sha256::default();
hasher.input(canonical_req.as_bytes());
hasher.update(canonical_req.as_bytes());
[
"AWS4-HMAC-SHA256",
&datetime.format(LONG_DATETIME).to_string(),
scope_string,
&hex::encode(hasher.result().as_slice()),
&hex::encode(hasher.finalize().as_slice()),
]
.join("\n")
}
@ -238,14 +236,14 @@ fn signing_hmac(
) -> Result<HmacSha256, crypto_mac::InvalidKeyLength> {
let secret = String::from("AWS4") + secret_key;
let mut date_hmac = HmacSha256::new_varkey(secret.as_bytes())?;
date_hmac.input(datetime.format(SHORT_DATE).to_string().as_bytes());
let mut region_hmac = HmacSha256::new_varkey(&date_hmac.result().code())?;
region_hmac.input(region.as_bytes());
let mut service_hmac = HmacSha256::new_varkey(&region_hmac.result().code())?;
service_hmac.input(service.as_bytes());
let mut signing_hmac = HmacSha256::new_varkey(&service_hmac.result().code())?;
signing_hmac.input(b"aws4_request");
let hmac = HmacSha256::new_varkey(&signing_hmac.result().code())?;
date_hmac.update(datetime.format(SHORT_DATE).to_string().as_bytes());
let mut region_hmac = HmacSha256::new_varkey(&date_hmac.finalize().into_bytes())?;
region_hmac.update(region.as_bytes());
let mut service_hmac = HmacSha256::new_varkey(&region_hmac.finalize().into_bytes())?;
service_hmac.update(service.as_bytes());
let mut signing_hmac = HmacSha256::new_varkey(&service_hmac.finalize().into_bytes())?;
signing_hmac.update(b"aws4_request");
let hmac = HmacSha256::new_varkey(&signing_hmac.finalize().into_bytes())?;
Ok(hmac)
}

View file

@ -21,21 +21,20 @@ garage_model = { version = "0.1.1", path = "../model" }
garage_api = { version = "0.1.1", path = "../api" }
garage_web = { version = "0.1.1", path = "../web" }
bytes = "0.4"
rand = "0.7"
hex = "0.3"
sha2 = "0.8"
bytes = "1.0"
rand = "0.8"
hex = "0.4"
log = "0.4"
pretty_env_logger = "0.4"
git-version = "0.3.4"
sled = "0.34"
old_sled = { package = "sled", version = "0.31" }
structopt = { version = "0.3", default-features = false }
toml = "0.5"
rmp-serde = "0.14.3"
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
futures = "0.3"
futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] }
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

View file

@ -1,3 +1,5 @@
use std::collections::HashMap;
use std::fmt::Write;
use std::sync::Arc;
use serde::{Deserialize, Serialize};
@ -5,6 +7,7 @@ use serde::{Deserialize, Serialize};
use garage_util::error::Error;
use garage_table::crdt::CRDT;
use garage_table::replication::*;
use garage_table::*;
use garage_rpc::rpc_client::*;
@ -14,6 +17,7 @@ use garage_model::bucket_table::*;
use garage_model::garage::Garage;
use garage_model::key_table::*;
use crate::cli::*;
use crate::repair::Repair;
use crate::*;
@ -25,6 +29,7 @@ pub enum AdminRPC {
BucketOperation(BucketOperation),
KeyOperation(KeyOperation),
LaunchRepair(RepairOpt),
Stats(StatsOpt),
// Replies
Ok(String),
@ -55,6 +60,7 @@ impl AdminRpcHandler {
AdminRPC::BucketOperation(bo) => self2.handle_bucket_cmd(bo).await,
AdminRPC::KeyOperation(ko) => self2.handle_key_cmd(ko).await,
AdminRPC::LaunchRepair(opt) => self2.handle_launch_repair(opt).await,
AdminRPC::Stats(opt) => self2.handle_stats(opt).await,
_ => Err(Error::BadRPC(format!("Invalid RPC"))),
}
}
@ -116,7 +122,7 @@ impl AdminRpcHandler {
for (key_id, _, _) in bucket.authorized_keys() {
if let Some(key) = self.garage.key_table.get(&EmptyKey, key_id).await? {
if !key.deleted.get() {
self.update_key_bucket(key, &bucket.name, false, false)
self.update_key_bucket(&key, &bucket.name, false, false)
.await?;
}
} else {
@ -128,31 +134,31 @@ impl AdminRpcHandler {
Ok(AdminRPC::Ok(format!("Bucket {} was deleted.", query.name)))
}
BucketOperation::Allow(query) => {
let key = self.get_existing_key(&query.key_id).await?;
let key = self.get_existing_key(&query.key_pattern).await?;
let bucket = self.get_existing_bucket(&query.bucket).await?;
let allow_read = query.read || key.allow_read(&query.bucket);
let allow_write = query.write || key.allow_write(&query.bucket);
self.update_key_bucket(key, &query.bucket, allow_read, allow_write)
self.update_key_bucket(&key, &query.bucket, allow_read, allow_write)
.await?;
self.update_bucket_key(bucket, &query.key_id, allow_read, allow_write)
self.update_bucket_key(bucket, &key.key_id, allow_read, allow_write)
.await?;
Ok(AdminRPC::Ok(format!(
"New permissions for {} on {}: read {}, write {}.",
&query.key_id, &query.bucket, allow_read, allow_write
&key.key_id, &query.bucket, allow_read, allow_write
)))
}
BucketOperation::Deny(query) => {
let key = self.get_existing_key(&query.key_id).await?;
let key = self.get_existing_key(&query.key_pattern).await?;
let bucket = self.get_existing_bucket(&query.bucket).await?;
let allow_read = !query.read && key.allow_read(&query.bucket);
let allow_write = !query.write && key.allow_write(&query.bucket);
self.update_key_bucket(key, &query.bucket, allow_read, allow_write)
self.update_key_bucket(&key, &query.bucket, allow_read, allow_write)
.await?;
self.update_bucket_key(bucket, &query.key_id, allow_read, allow_write)
self.update_bucket_key(bucket, &key.key_id, allow_read, allow_write)
.await?;
Ok(AdminRPC::Ok(format!(
"New permissions for {} on {}: read {}, write {}.",
&query.key_id, &query.bucket, allow_read, allow_write
&key.key_id, &query.bucket, allow_read, allow_write
)))
}
BucketOperation::Website(query) => {
@ -187,7 +193,12 @@ impl AdminRpcHandler {
let key_ids = self
.garage
.key_table
.get_range(&EmptyKey, None, Some(DeletedFilter::NotDeleted), 10000)
.get_range(
&EmptyKey,
None,
Some(KeyFilter::Deleted(DeletedFilter::NotDeleted)),
10000,
)
.await?
.iter()
.map(|k| (k.key_id.to_string(), k.name.get().clone()))
@ -195,7 +206,7 @@ impl AdminRpcHandler {
Ok(AdminRPC::KeyList(key_ids))
}
KeyOperation::Info(query) => {
let key = self.get_existing_key(&query.key_id).await?;
let key = self.get_existing_key(&query.key_pattern).await?;
Ok(AdminRPC::KeyInfo(key))
}
KeyOperation::New(query) => {
@ -204,13 +215,13 @@ impl AdminRpcHandler {
Ok(AdminRPC::KeyInfo(key))
}
KeyOperation::Rename(query) => {
let mut key = self.get_existing_key(&query.key_id).await?;
let mut key = self.get_existing_key(&query.key_pattern).await?;
key.name.update(query.new_name);
self.garage.key_table.insert(&key).await?;
Ok(AdminRPC::KeyInfo(key))
}
KeyOperation::Delete(query) => {
let key = self.get_existing_key(&query.key_id).await?;
let key = self.get_existing_key(&query.key_pattern).await?;
if !query.yes {
return Err(Error::BadRPC(format!(
"Add --yes flag to really perform this operation"
@ -227,11 +238,11 @@ impl AdminRpcHandler {
return Err(Error::Message(format!("Bucket not found: {}", ab_name)));
}
}
let del_key = Key::delete(key.key_id);
let del_key = Key::delete(key.key_id.to_string());
self.garage.key_table.insert(&del_key).await?;
Ok(AdminRPC::Ok(format!(
"Key {} was deleted successfully.",
query.key_id
key.key_id
)))
}
}
@ -250,14 +261,28 @@ impl AdminRpcHandler {
))))
}
async fn get_existing_key(&self, id: &String) -> Result<Key, Error> {
self.garage
async fn get_existing_key(&self, pattern: &str) -> Result<Key, Error> {
let candidates = self
.garage
.key_table
.get(&EmptyKey, id)
.get_range(
&EmptyKey,
None,
Some(KeyFilter::Matches(pattern.to_string())),
10,
)
.await?
.into_iter()
.filter(|k| !k.deleted.get())
.map(Ok)
.unwrap_or(Err(Error::BadRPC(format!("Key {} does not exist", id))))
.collect::<Vec<_>>();
if candidates.len() != 1 {
Err(Error::Message(format!(
"{} matching keys",
candidates.len()
)))
} else {
Ok(candidates.into_iter().next().unwrap())
}
}
/// Update **bucket table** to inform of the new linked key
@ -290,11 +315,12 @@ impl AdminRpcHandler {
/// Update **key table** to inform of the new linked bucket
async fn update_key_bucket(
&self,
mut key: Key,
key: &Key,
bucket: &String,
allow_read: bool,
allow_write: bool,
) -> Result<(), Error> {
let mut key = key.clone();
let old_map = key.authorized_buckets.take_and_clear();
key.authorized_buckets.merge(&old_map.update_mutator(
bucket.clone(),
@ -350,12 +376,118 @@ impl AdminRpcHandler {
.background
.spawn_worker("Repair worker".into(), move |must_exit| async move {
repair.repair_worker(opt, must_exit).await
})
.await;
});
Ok(AdminRPC::Ok(format!(
"Repair launched on {:?}",
self.garage.system.id
)))
}
}
async fn handle_stats(&self, opt: StatsOpt) -> Result<AdminRPC, Error> {
if opt.all_nodes {
let mut ret = String::new();
let ring = self.garage.system.ring.borrow().clone();
for node in ring.config.members.keys() {
let mut opt = opt.clone();
opt.all_nodes = false;
writeln!(&mut ret, "\n======================").unwrap();
writeln!(&mut ret, "Stats for node {:?}:", node).unwrap();
match self
.rpc_client
.call(*node, AdminRPC::Stats(opt), ADMIN_RPC_TIMEOUT)
.await
{
Ok(AdminRPC::Ok(s)) => writeln!(&mut ret, "{}", s).unwrap(),
Ok(x) => writeln!(&mut ret, "Bad answer: {:?}", x).unwrap(),
Err(e) => writeln!(&mut ret, "Error: {}", e).unwrap(),
}
}
Ok(AdminRPC::Ok(ret))
} else {
Ok(AdminRPC::Ok(self.gather_stats_local(opt)?))
}
}
fn gather_stats_local(&self, opt: StatsOpt) -> Result<String, Error> {
let mut ret = String::new();
writeln!(
&mut ret,
"\nGarage version: {}",
git_version::git_version!()
)
.unwrap();
// Gather ring statistics
let ring = self.garage.system.ring.borrow().clone();
let mut ring_nodes = HashMap::new();
for r in ring.ring.iter() {
for n in r.nodes.iter() {
if !ring_nodes.contains_key(n) {
ring_nodes.insert(*n, 0usize);
}
*ring_nodes.get_mut(n).unwrap() += 1;
}
}
writeln!(&mut ret, "\nRing nodes & partition count:").unwrap();
for (n, c) in ring_nodes.iter() {
writeln!(&mut ret, " {:?} {}", n, c).unwrap();
}
self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.key_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.object_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.version_table, &opt)?;
self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt)?;
writeln!(&mut ret, "\nBlock manager stats:").unwrap();
writeln!(
&mut ret,
" number of blocks: {}",
self.garage.block_manager.rc_len()
)
.unwrap();
writeln!(
&mut ret,
" resync queue length: {}",
self.garage.block_manager.resync_queue_len()
)
.unwrap();
if opt.detailed {
writeln!(&mut ret, "\nDetailed stats not implemented yet.").unwrap();
}
Ok(ret)
}
fn gather_table_stats<F, R>(
&self,
to: &mut String,
t: &Arc<Table<F, R>>,
_opt: &StatsOpt,
) -> Result<(), Error>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
writeln!(to, "\nTable stats for {}", t.data.name).unwrap();
writeln!(to, " number of items: {}", t.data.store.len()).unwrap();
writeln!(
to,
" Merkle updater todo queue length: {}",
t.merkle_updater.todo_len()
)
.unwrap();
writeln!(
to,
" Merkle tree size: {}",
t.merkle_updater.merkle_tree_len()
)
.unwrap();
writeln!(to, " GC todo queue length: {}", t.data.gc_todo_len()).unwrap();
Ok(())
}
}

535
src/garage/cli.rs Normal file
View file

@ -0,0 +1,535 @@
use std::collections::HashSet;
use std::net::SocketAddr;
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use garage_util::error::Error;
use garage_util::time::*;
use garage_rpc::membership::*;
use garage_rpc::ring::*;
use garage_rpc::rpc_client::*;
use garage_model::bucket_table::*;
use garage_model::key_table::*;
use crate::admin_rpc::*;
#[derive(StructOpt, Debug)]
pub enum Command {
/// Run Garage server
#[structopt(name = "server")]
Server(ServerOpt),
/// Get network status
#[structopt(name = "status")]
Status,
/// Garage node operations
#[structopt(name = "node")]
Node(NodeOperation),
/// Bucket operations
#[structopt(name = "bucket")]
Bucket(BucketOperation),
/// Key operations
#[structopt(name = "key")]
Key(KeyOperation),
/// Start repair of node data
#[structopt(name = "repair")]
Repair(RepairOpt),
/// Gather node statistics
#[structopt(name = "stats")]
Stats(StatsOpt),
}
#[derive(StructOpt, Debug)]
pub struct ServerOpt {
/// Configuration file
#[structopt(short = "c", long = "config", default_value = "./config.toml")]
pub config_file: PathBuf,
}
#[derive(StructOpt, Debug)]
pub enum NodeOperation {
/// Configure Garage node
#[structopt(name = "configure")]
Configure(ConfigureNodeOpt),
/// Remove Garage node from cluster
#[structopt(name = "remove")]
Remove(RemoveNodeOpt),
}
#[derive(StructOpt, Debug)]
pub struct ConfigureNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// Location (datacenter) of the node
#[structopt(short = "d", long = "datacenter")]
datacenter: Option<String>,
/// Capacity (in relative terms, use 1 to represent your smallest server)
#[structopt(short = "c", long = "capacity")]
capacity: Option<u32>,
/// Optionnal node tag
#[structopt(short = "t", long = "tag")]
tag: Option<String>,
}
#[derive(StructOpt, Debug)]
pub struct RemoveNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// If this flag is not given, the node won't be removed
#[structopt(long = "yes")]
yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum BucketOperation {
/// List buckets
#[structopt(name = "list")]
List,
/// Get bucket info
#[structopt(name = "info")]
Info(BucketOpt),
/// Create bucket
#[structopt(name = "create")]
Create(BucketOpt),
/// Delete bucket
#[structopt(name = "delete")]
Delete(DeleteBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "allow")]
Allow(PermBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "deny")]
Deny(PermBucketOpt),
/// Expose as website or not
#[structopt(name = "website")]
Website(WebsiteOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct WebsiteOpt {
/// Create
#[structopt(long = "allow")]
pub allow: bool,
/// Delete
#[structopt(long = "deny")]
pub deny: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct BucketOpt {
/// Bucket name
pub name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct DeleteBucketOpt {
/// Bucket name
pub name: String,
/// If this flag is not given, the bucket won't be deleted
#[structopt(long = "yes")]
pub yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct PermBucketOpt {
/// Access key name or ID
#[structopt(long = "key")]
pub key_pattern: String,
/// Allow/deny read operations
#[structopt(long = "read")]
pub read: bool,
/// Allow/deny write operations
#[structopt(long = "write")]
pub write: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum KeyOperation {
/// List keys
#[structopt(name = "list")]
List,
/// Get key info
#[structopt(name = "info")]
Info(KeyOpt),
/// Create new key
#[structopt(name = "new")]
New(KeyNewOpt),
/// Rename key
#[structopt(name = "rename")]
Rename(KeyRenameOpt),
/// Delete key
#[structopt(name = "delete")]
Delete(KeyDeleteOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyOpt {
/// ID or name of the key
pub key_pattern: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyNewOpt {
/// Name of the key
#[structopt(long = "name", default_value = "Unnamed key")]
pub name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyRenameOpt {
/// ID or name of the key
pub key_pattern: String,
/// New name of the key
pub new_name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyDeleteOpt {
/// ID or name of the key
pub key_pattern: String,
/// Confirm deletion
#[structopt(long = "yes")]
pub yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct RepairOpt {
/// Launch repair operation on all nodes
#[structopt(short = "a", long = "all-nodes")]
pub all_nodes: bool,
/// Confirm the launch of the repair operation
#[structopt(long = "yes")]
pub yes: bool,
#[structopt(subcommand)]
pub what: Option<RepairWhat>,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum RepairWhat {
/// Only do a full sync of metadata tables
#[structopt(name = "tables")]
Tables,
/// Only repair (resync/rebalance) the set of stored blocks
#[structopt(name = "blocks")]
Blocks,
/// Only redo the propagation of object deletions to the version table (slow)
#[structopt(name = "versions")]
Versions,
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
#[structopt(name = "block_refs")]
BlockRefs,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct StatsOpt {
/// Gather statistics from all nodes
#[structopt(short = "a", long = "all-nodes")]
pub all_nodes: bool,
/// Gather detailed statistics (this can be long)
#[structopt(short = "d", long = "detailed")]
pub detailed: bool,
}
pub async fn cli_cmd(
cmd: Command,
membership_rpc_cli: RpcAddrClient<Message>,
admin_rpc_cli: RpcAddrClient<AdminRPC>,
rpc_host: SocketAddr,
) -> Result<(), Error> {
match cmd {
Command::Status => cmd_status(membership_rpc_cli, rpc_host).await,
Command::Node(NodeOperation::Configure(configure_opt)) => {
cmd_configure(membership_rpc_cli, rpc_host, configure_opt).await
}
Command::Node(NodeOperation::Remove(remove_opt)) => {
cmd_remove(membership_rpc_cli, rpc_host, remove_opt).await
}
Command::Bucket(bo) => {
cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::BucketOperation(bo)).await
}
Command::Key(ko) => cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::KeyOperation(ko)).await,
Command::Repair(ro) => cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::LaunchRepair(ro)).await,
Command::Stats(so) => cmd_admin(admin_rpc_cli, rpc_host, AdminRPC::Stats(so)).await,
_ => unreachable!(),
}
}
pub async fn cmd_status(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
println!("Healthy nodes:");
for adv in status.iter().filter(|x| x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}",
adv.id, adv.state_info.hostname, adv.addr, cfg.tag, cfg.datacenter, cfg.capacity
);
} else {
println!(
"{:?}\t{}\t{}\tUNCONFIGURED/REMOVED",
adv.id, adv.state_info.hostname, adv.addr
);
}
}
let status_keys = status.iter().map(|x| x.id).collect::<HashSet<_>>();
let failure_case_1 = status.iter().any(|x| !x.is_up);
let failure_case_2 = config
.members
.iter()
.any(|(id, _)| !status_keys.contains(id));
if failure_case_1 || failure_case_2 {
println!("\nFailed nodes:");
for adv in status.iter().filter(|x| !x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}\tlast seen: {}s ago",
adv.id,
adv.state_info.hostname,
adv.addr,
cfg.tag,
cfg.datacenter,
cfg.capacity,
(now_msec() - adv.last_seen) / 1000,
);
}
}
for (id, cfg) in config.members.iter() {
if !status.iter().any(|x| x.id == *id) {
println!(
"{:?}\t{}\t{}\t{}\tnever seen",
id, cfg.tag, cfg.datacenter, cfg.capacity
);
}
}
}
Ok(())
}
pub async fn cmd_configure(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: ConfigureNodeOpt,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for adv in status.iter() {
if hex::encode(&adv.id).starts_with(&args.node_id) {
candidates.push(adv.id);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let new_entry = match config.members.get(&candidates[0]) {
None => NetworkConfigEntry {
datacenter: args
.datacenter
.expect("Please specifiy a datacenter with the -d flag"),
capacity: args
.capacity
.expect("Please specifiy a capacity with the -c flag"),
tag: args.tag.unwrap_or("".to_string()),
},
Some(old) => NetworkConfigEntry {
datacenter: args.datacenter.unwrap_or(old.datacenter.to_string()),
capacity: args.capacity.unwrap_or(old.capacity),
tag: args.tag.unwrap_or(old.tag.to_string()),
},
};
config.members.insert(candidates[0].clone(), new_entry);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
pub async fn cmd_remove(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: RemoveNodeOpt,
) -> Result<(), Error> {
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for (key, _) in config.members.iter() {
if hex::encode(key).starts_with(&args.node_id) {
candidates.push(*key);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
if !args.yes {
return Err(Error::Message(format!(
"Add the flag --yes to really remove {:?} from the cluster",
candidates[0]
)));
}
config.members.remove(&candidates[0]);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
pub async fn cmd_admin(
rpc_cli: RpcAddrClient<AdminRPC>,
rpc_host: SocketAddr,
args: AdminRPC,
) -> Result<(), Error> {
match rpc_cli.call(&rpc_host, args, ADMIN_RPC_TIMEOUT).await?? {
AdminRPC::Ok(msg) => {
println!("{}", msg);
}
AdminRPC::BucketList(bl) => {
println!("List of buckets:");
for bucket in bl {
println!("{}", bucket);
}
}
AdminRPC::BucketInfo(bucket) => {
print_bucket_info(&bucket);
}
AdminRPC::KeyList(kl) => {
println!("List of keys:");
for key in kl {
println!("{}\t{}", key.0, key.1);
}
}
AdminRPC::KeyInfo(key) => {
print_key_info(&key);
}
r => {
error!("Unexpected response: {:?}", r);
}
}
Ok(())
}
fn print_key_info(key: &Key) {
println!("Key name: {}", key.name.get());
println!("Key ID: {}", key.key_id);
println!("Secret key: {}", key.secret_key);
if key.deleted.get() {
println!("Key is deleted.");
} else {
println!("Authorized buckets:");
for (b, _, perm) in key.authorized_buckets.items().iter() {
println!("- {} R:{} W:{}", b, perm.allow_read, perm.allow_write);
}
}
}
fn print_bucket_info(bucket: &Bucket) {
println!("Bucket name: {}", bucket.name);
match bucket.state.get() {
BucketState::Deleted => println!("Bucket is deleted."),
BucketState::Present(p) => {
println!("Authorized keys:");
for (k, _, perm) in p.authorized_keys.items().iter() {
println!("- {} R:{} W:{}", k, perm.allow_read, perm.allow_write);
}
println!("Website access: {}", p.website.get());
}
};
}

View file

@ -4,289 +4,67 @@
extern crate log;
mod admin_rpc;
mod cli;
mod repair;
mod server;
use std::collections::HashSet;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use garage_util::config::TlsConfig;
use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::membership::*;
use garage_rpc::ring::*;
use garage_rpc::rpc_client::*;
use admin_rpc::*;
use cli::*;
#[derive(StructOpt, Debug)]
#[structopt(name = "garage")]
pub struct Opt {
/// RPC connect to this host to execute client operations
#[structopt(short = "h", long = "rpc-host", default_value = "127.0.0.1:3901")]
rpc_host: SocketAddr,
pub rpc_host: SocketAddr,
#[structopt(long = "ca-cert")]
ca_cert: Option<String>,
pub ca_cert: Option<String>,
#[structopt(long = "client-cert")]
client_cert: Option<String>,
pub client_cert: Option<String>,
#[structopt(long = "client-key")]
client_key: Option<String>,
pub client_key: Option<String>,
#[structopt(subcommand)]
cmd: Command,
}
#[derive(StructOpt, Debug)]
pub enum Command {
/// Run Garage server
#[structopt(name = "server")]
Server(ServerOpt),
/// Get network status
#[structopt(name = "status")]
Status,
/// Garage node operations
#[structopt(name = "node")]
Node(NodeOperation),
/// Bucket operations
#[structopt(name = "bucket")]
Bucket(BucketOperation),
/// Key operations
#[structopt(name = "key")]
Key(KeyOperation),
/// Start repair of node data
#[structopt(name = "repair")]
Repair(RepairOpt),
}
#[derive(StructOpt, Debug)]
pub struct ServerOpt {
/// Configuration file
#[structopt(short = "c", long = "config", default_value = "./config.toml")]
config_file: PathBuf,
}
#[derive(StructOpt, Debug)]
pub enum NodeOperation {
/// Configure Garage node
#[structopt(name = "configure")]
Configure(ConfigureNodeOpt),
/// Remove Garage node from cluster
#[structopt(name = "remove")]
Remove(RemoveNodeOpt),
}
#[derive(StructOpt, Debug)]
pub struct ConfigureNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// Location (datacenter) of the node
#[structopt(short = "d", long = "datacenter")]
datacenter: Option<String>,
/// Capacity (in relative terms, use 1 to represent your smallest server)
#[structopt(short = "c", long = "capacity")]
capacity: Option<u32>,
/// Optionnal node tag
#[structopt(short = "t", long = "tag")]
tag: Option<String>,
}
#[derive(StructOpt, Debug)]
pub struct RemoveNodeOpt {
/// Node to configure (prefix of hexadecimal node id)
node_id: String,
/// If this flag is not given, the node won't be removed
#[structopt(long = "yes")]
yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum BucketOperation {
/// List buckets
#[structopt(name = "list")]
List,
/// Get bucket info
#[structopt(name = "info")]
Info(BucketOpt),
/// Create bucket
#[structopt(name = "create")]
Create(BucketOpt),
/// Delete bucket
#[structopt(name = "delete")]
Delete(DeleteBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "allow")]
Allow(PermBucketOpt),
/// Allow key to read or write to bucket
#[structopt(name = "deny")]
Deny(PermBucketOpt),
/// Expose as website or not
#[structopt(name = "website")]
Website(WebsiteOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct WebsiteOpt {
/// Create
#[structopt(long = "allow")]
pub allow: bool,
/// Delete
#[structopt(long = "deny")]
pub deny: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct BucketOpt {
/// Bucket name
pub name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct DeleteBucketOpt {
/// Bucket name
pub name: String,
/// If this flag is not given, the bucket won't be deleted
#[structopt(long = "yes")]
pub yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct PermBucketOpt {
/// Access key ID
#[structopt(long = "key")]
pub key_id: String,
/// Allow/deny read operations
#[structopt(long = "read")]
pub read: bool,
/// Allow/deny write operations
#[structopt(long = "write")]
pub write: bool,
/// Bucket name
pub bucket: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum KeyOperation {
/// List keys
#[structopt(name = "list")]
List,
/// Get key info
#[structopt(name = "info")]
Info(KeyOpt),
/// Create new key
#[structopt(name = "new")]
New(KeyNewOpt),
/// Rename key
#[structopt(name = "rename")]
Rename(KeyRenameOpt),
/// Delete key
#[structopt(name = "delete")]
Delete(KeyDeleteOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyOpt {
/// ID of the key
key_id: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyNewOpt {
/// Name of the key
#[structopt(long = "name", default_value = "Unnamed key")]
name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyRenameOpt {
/// ID of the key
key_id: String,
/// New name of the key
new_name: String,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub struct KeyDeleteOpt {
/// ID of the key
key_id: String,
/// Confirm deletion
#[structopt(long = "yes")]
yes: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct RepairOpt {
/// Launch repair operation on all nodes
#[structopt(short = "a", long = "all-nodes")]
pub all_nodes: bool,
/// Confirm the launch of the repair operation
#[structopt(long = "yes")]
pub yes: bool,
#[structopt(subcommand)]
pub what: Option<RepairWhat>,
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum RepairWhat {
/// Only do a full sync of metadata tables
#[structopt(name = "tables")]
Tables,
/// Only repair (resync/rebalance) the set of stored blocks
#[structopt(name = "blocks")]
Blocks,
/// Only redo the propagation of object deletions to the version table (slow)
#[structopt(name = "versions")]
Versions,
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
#[structopt(name = "block_refs")]
BlockRefs,
}
#[tokio::main]
async fn main() {
pretty_env_logger::init();
let opt = Opt::from_args();
let res = if let Command::Server(server_opt) = opt.cmd {
// Abort on panic (same behavior as in Go)
std::panic::set_hook(Box::new(|panic_info| {
error!("{}", panic_info.to_string());
std::process::abort();
}));
server::run_server(server_opt.config_file).await
} else {
cli_command(opt).await
};
if let Err(e) = res {
error!("{}", e);
}
}
async fn cli_command(opt: Opt) -> Result<(), Error> {
let tls_config = match (opt.ca_cert, opt.client_cert, opt.client_key) {
(Some(ca_cert), Some(client_cert), Some(client_key)) => Some(TlsConfig {
ca_cert,
@ -306,245 +84,5 @@ async fn main() {
RpcAddrClient::new(rpc_http_cli.clone(), MEMBERSHIP_RPC_PATH.to_string());
let admin_rpc_cli = RpcAddrClient::new(rpc_http_cli.clone(), ADMIN_RPC_PATH.to_string());
let resp = match opt.cmd {
Command::Server(server_opt) => {
// Abort on panic (same behavior as in Go)
std::panic::set_hook(Box::new(|panic_info| {
error!("{}", panic_info.to_string());
std::process::abort();
}));
server::run_server(server_opt.config_file).await
}
Command::Status => cmd_status(membership_rpc_cli, opt.rpc_host).await,
Command::Node(NodeOperation::Configure(configure_opt)) => {
cmd_configure(membership_rpc_cli, opt.rpc_host, configure_opt).await
}
Command::Node(NodeOperation::Remove(remove_opt)) => {
cmd_remove(membership_rpc_cli, opt.rpc_host, remove_opt).await
}
Command::Bucket(bo) => {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::BucketOperation(bo)).await
}
Command::Key(bo) => {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::KeyOperation(bo)).await
}
Command::Repair(ro) => {
cmd_admin(admin_rpc_cli, opt.rpc_host, AdminRPC::LaunchRepair(ro)).await
}
};
if let Err(e) = resp {
error!("Error: {}", e);
}
}
async fn cmd_status(rpc_cli: RpcAddrClient<Message>, rpc_host: SocketAddr) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
println!("Healthy nodes:");
for adv in status.iter().filter(|x| x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}",
adv.id, adv.state_info.hostname, adv.addr, cfg.tag, cfg.datacenter, cfg.capacity
);
} else {
println!(
"{:?}\t{}\t{}\tUNCONFIGURED/REMOVED",
adv.id, adv.state_info.hostname, adv.addr
);
}
}
let status_keys = status.iter().map(|x| x.id).collect::<HashSet<_>>();
let failure_case_1 = status.iter().any(|x| !x.is_up);
let failure_case_2 = config
.members
.iter()
.any(|(id, _)| !status_keys.contains(id));
if failure_case_1 || failure_case_2 {
println!("\nFailed nodes:");
for adv in status.iter().filter(|x| !x.is_up) {
if let Some(cfg) = config.members.get(&adv.id) {
println!(
"{:?}\t{}\t{}\t[{}]\t{}\t{}\tlast seen: {}s ago",
adv.id,
adv.state_info.hostname,
adv.addr,
cfg.tag,
cfg.datacenter,
cfg.capacity,
(now_msec() - adv.last_seen) / 1000,
);
}
}
for (id, cfg) in config.members.iter() {
if !status.iter().any(|x| x.id == *id) {
println!(
"{:?}\t{}\t{}\t{}\tnever seen",
id, cfg.tag, cfg.datacenter, cfg.capacity
);
}
}
}
Ok(())
}
async fn cmd_configure(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: ConfigureNodeOpt,
) -> Result<(), Error> {
let status = match rpc_cli
.call(&rpc_host, &Message::PullStatus, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseNodesUp(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for adv in status.iter() {
if hex::encode(&adv.id).starts_with(&args.node_id) {
candidates.push(adv.id);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let new_entry = match config.members.get(&candidates[0]) {
None => NetworkConfigEntry {
datacenter: args
.datacenter
.expect("Please specifiy a datacenter with the -d flag"),
capacity: args
.capacity
.expect("Please specifiy a capacity with the -c flag"),
tag: args.tag.unwrap_or("".to_string()),
},
Some(old) => NetworkConfigEntry {
datacenter: args.datacenter.unwrap_or(old.datacenter.to_string()),
capacity: args.capacity.unwrap_or(old.capacity),
tag: args.tag.unwrap_or(old.tag.to_string()),
},
};
config.members.insert(candidates[0].clone(), new_entry);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
async fn cmd_remove(
rpc_cli: RpcAddrClient<Message>,
rpc_host: SocketAddr,
args: RemoveNodeOpt,
) -> Result<(), Error> {
let mut config = match rpc_cli
.call(&rpc_host, &Message::PullConfig, ADMIN_RPC_TIMEOUT)
.await??
{
Message::AdvertiseConfig(cfg) => cfg,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
let mut candidates = vec![];
for (key, _) in config.members.iter() {
if hex::encode(key).starts_with(&args.node_id) {
candidates.push(*key);
}
}
if candidates.len() != 1 {
return Err(Error::Message(format!(
"{} matching nodes",
candidates.len()
)));
}
if !args.yes {
return Err(Error::Message(format!(
"Add the flag --yes to really remove {:?} from the cluster",
candidates[0]
)));
}
config.members.remove(&candidates[0]);
config.version += 1;
rpc_cli
.call(
&rpc_host,
&Message::AdvertiseConfig(config),
ADMIN_RPC_TIMEOUT,
)
.await??;
Ok(())
}
async fn cmd_admin(
rpc_cli: RpcAddrClient<AdminRPC>,
rpc_host: SocketAddr,
args: AdminRPC,
) -> Result<(), Error> {
match rpc_cli.call(&rpc_host, args, ADMIN_RPC_TIMEOUT).await?? {
AdminRPC::Ok(msg) => {
println!("{}", msg);
}
AdminRPC::BucketList(bl) => {
println!("List of buckets:");
for bucket in bl {
println!("{}", bucket);
}
}
AdminRPC::BucketInfo(bucket) => {
println!("{:?}", bucket);
}
AdminRPC::KeyList(kl) => {
println!("List of keys:");
for key in kl {
println!("{}\t{}", key.0, key.1);
}
}
AdminRPC::KeyInfo(key) => {
println!("{:?}", key);
}
r => {
error!("Unexpected response: {:?}", r);
}
}
Ok(())
cli_cmd(opt.cmd, membership_rpc_cli, admin_rpc_cli, opt.rpc_host).await
}

View file

@ -16,7 +16,13 @@ pub struct Repair {
}
impl Repair {
pub async fn repair_worker(
pub async fn repair_worker(&self, opt: RepairOpt, must_exit: watch::Receiver<bool>) {
if let Err(e) = self.repair_worker_aux(opt, must_exit).await {
warn!("Repair worker failed with error: {}", e);
}
}
async fn repair_worker_aux(
&self,
opt: RepairOpt,
must_exit: watch::Receiver<bool>,
@ -25,41 +31,11 @@ impl Repair {
if todo(RepairWhat::Tables) {
info!("Launching a full sync of tables");
self.garage
.bucket_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage
.object_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage
.version_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage
.block_ref_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage
.key_table
.syncer
.load_full()
.unwrap()
.add_full_scan()
.await;
self.garage.bucket_table.syncer.add_full_sync();
self.garage.object_table.syncer.add_full_sync();
self.garage.version_table.syncer.add_full_sync();
self.garage.block_ref_table.syncer.add_full_sync();
self.garage.key_table.syncer.add_full_sync();
}
// TODO: wait for full sync to finish before proceeding to the rest?
@ -93,11 +69,13 @@ impl Repair {
async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
let mut pos = vec![];
while let Some((item_key, item_bytes)) = self.garage.version_table.store.get_gt(&pos)? {
while let Some((item_key, item_bytes)) =
self.garage.version_table.data.store.get_gt(&pos)?
{
pos = item_key.to_vec();
let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?;
if version.deleted {
if version.deleted.get() {
continue;
}
let object = self
@ -110,13 +88,7 @@ impl Repair {
.versions()
.iter()
.any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted),
None => {
warn!(
"Repair versions: object for version {:?} not found, skipping.",
version
);
continue;
}
None => false,
};
if !version_exists {
info!("Repair versions: marking version as deleted: {:?}", version);
@ -127,7 +99,6 @@ impl Repair {
version.bucket,
version.key,
true,
vec![],
))
.await?;
}
@ -142,11 +113,13 @@ impl Repair {
async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
let mut pos = vec![];
while let Some((item_key, item_bytes)) = self.garage.block_ref_table.store.get_gt(&pos)? {
while let Some((item_key, item_bytes)) =
self.garage.block_ref_table.data.store.get_gt(&pos)?
{
pos = item_key.to_vec();
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?;
if block_ref.deleted {
if block_ref.deleted.get() {
continue;
}
let version = self
@ -154,16 +127,8 @@ impl Repair {
.version_table
.get(&block_ref.version, &EmptyKey)
.await?;
let ref_exists = match version {
Some(v) => !v.deleted,
None => {
warn!(
"Block ref repair: version for block ref {:?} not found, skipping.",
block_ref
);
continue;
}
};
// The version might not exist if it has been GC'ed
let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false);
if !ref_exists {
info!(
"Repair block ref: marking block_ref as deleted: {:?}",
@ -174,7 +139,7 @@ impl Repair {
.insert(&BlockRef {
block: block_ref.block,
version: block_ref.version,
deleted: true,
deleted: true.into(),
})
.await?;
}

View file

@ -21,13 +21,13 @@ async fn shutdown_signal(send_cancel: watch::Sender<bool>) -> Result<(), Error>
.await
.expect("failed to install CTRL+C signal handler");
info!("Received CTRL+C, shutting down.");
send_cancel.broadcast(true)?;
send_cancel.send(true)?;
Ok(())
}
async fn wait_from(mut chan: watch::Receiver<bool>) -> () {
while let Some(exit_now) = chan.recv().await {
if exit_now {
while !*chan.borrow() {
if chan.changed().await.is_err() {
return;
}
}
@ -40,37 +40,22 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
info!("Opening database...");
let mut db_path = config.metadata_dir.clone();
db_path.push("db");
let db = match sled::open(&db_path) {
Ok(db) => db,
Err(e) => {
warn!("Old DB could not be openned ({}), attempting migration.", e);
let old = old_sled::open(&db_path).expect("Unable to open old DB for migration");
let mut new_path = config.metadata_dir.clone();
new_path.push("db2");
let new = sled::open(&new_path).expect("Unable to open new DB for migration");
new.import(old.export());
if old.checksum().expect("unable to compute old db checksum")
!= new.checksum().expect("unable to compute new db checksum")
{
panic!("db checksums don't match after migration");
}
drop(new);
drop(old);
std::fs::remove_dir_all(&db_path).expect("Cannot remove old DB folder");
std::fs::rename(new_path, &db_path)
.expect("Cannot move new DB folder to correct place");
sled::open(db_path).expect("Unable to open new DB after migration")
}
};
let db = sled::open(&db_path).expect("Unable to open sled DB");
info!("Initialize RPC server...");
let mut rpc_server = RpcServer::new(config.rpc_bind_addr.clone(), config.rpc_tls.clone());
info!("Initializing background runner...");
let (send_cancel, watch_cancel) = watch::channel(false);
let background = BackgroundRunner::new(16, watch_cancel.clone());
let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone());
let garage = Garage::new(config, db, background.clone(), &mut rpc_server).await;
info!("Initializing Garage main data store...");
let garage = Garage::new(config.clone(), db, background, &mut rpc_server);
let bootstrap = garage.system.clone().bootstrap(
&config.bootstrap_peers[..],
config.consul_host,
config.consul_service_name,
);
info!("Crate admin RPC handler...");
AdminRpcHandler::new(garage.clone()).register_handler(&mut rpc_server);
@ -78,21 +63,13 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
info!("Initializing RPC and API servers...");
let run_rpc_server = Arc::new(rpc_server).run(wait_from(watch_cancel.clone()));
let api_server = api_server::run_api_server(garage.clone(), wait_from(watch_cancel.clone()));
let web_server = web_server::run_web_server(garage.clone(), wait_from(watch_cancel.clone()));
let web_server = web_server::run_web_server(garage, wait_from(watch_cancel.clone()));
futures::try_join!(
garage
.system
.clone()
.bootstrap(
&garage.config.bootstrap_peers[..],
garage.config.consul_host.clone(),
garage.config.consul_service_name.clone()
)
.map(|rv| {
info!("Bootstrap done");
Ok(rv)
}),
bootstrap.map(|rv| {
info!("Bootstrap done");
Ok(rv)
}),
run_rpc_server.map(|rv| {
info!("RPC server exited");
rv
@ -105,9 +82,9 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
info!("Web server exited");
rv
}),
background.run().map(|rv| {
info!("Background runner exited");
Ok(rv)
await_background_done.map(|rv| {
info!("Background runner exited: {:?}", rv);
Ok(())
}),
shutdown_signal(send_cancel),
)?;

View file

@ -16,23 +16,18 @@ path = "lib.rs"
garage_util = { version = "0.1.1", path = "../util" }
garage_rpc = { version = "0.1.1", path = "../rpc" }
garage_table = { version = "0.1.1", path = "../table" }
model010 = { package = "garage_model_010b", version = "0.0.1" }
bytes = "0.4"
rand = "0.7"
hex = "0.3"
sha2 = "0.8"
arc-swap = "0.4"
rand = "0.8"
hex = "0.4"
arc-swap = "1.0"
log = "0.4"
sled = "0.34"
rmp-serde = "0.14.3"
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11"
async-trait = "0.1.30"
futures = "0.3"
futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] }
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

View file

@ -5,22 +5,20 @@ use std::time::Duration;
use arc_swap::ArcSwapOption;
use futures::future::*;
use futures::select;
use futures::stream::*;
use serde::{Deserialize, Serialize};
use tokio::fs;
use tokio::prelude::*;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::sync::{watch, Mutex, Notify};
use garage_util::data;
use garage_util::data::*;
use garage_util::error::Error;
use garage_util::time::*;
use garage_rpc::membership::System;
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
use garage_table::table_sharded::TableShardedReplication;
use garage_table::TableReplication;
use garage_table::replication::{sharded::TableShardedReplication, TableReplication};
use crate::block_ref_table::*;
@ -28,7 +26,10 @@ use crate::garage::Garage;
pub const INLINE_THRESHOLD: usize = 3072;
pub const BACKGROUND_WORKERS: u64 = 1;
const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(42);
const BLOCK_GC_TIMEOUT: Duration = Duration::from_secs(60);
const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
const RESYNC_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
@ -56,14 +57,14 @@ pub struct BlockManager {
pub data_dir: PathBuf,
pub data_dir_lock: Mutex<()>,
pub rc: sled::Tree,
rc: sled::Tree,
pub resync_queue: sled::Tree,
pub resync_notify: Notify,
resync_queue: sled::Tree,
resync_notify: Notify,
pub system: Arc<System>,
system: Arc<System>,
rpc_client: Arc<RpcClient<Message>>,
pub garage: ArcSwapOption<Garage>,
pub(crate) garage: ArcSwapOption<Garage>,
}
impl BlockManager {
@ -77,7 +78,6 @@ impl BlockManager {
let rc = db
.open_tree("block_local_rc")
.expect("Unable to open block_local_rc tree");
rc.set_merge_operator(rc_merge);
let resync_queue = db
.open_tree("block_local_resync_queue")
@ -127,18 +127,16 @@ impl BlockManager {
}
}
pub async fn spawn_background_worker(self: Arc<Self>) {
pub fn spawn_background_worker(self: Arc<Self>) {
// Launch 2 simultaneous workers for background resync loop preprocessing
for i in 0..2usize {
for i in 0..BACKGROUND_WORKERS {
let bm2 = self.clone();
let background = self.system.background.clone();
tokio::spawn(async move {
tokio::time::delay_for(Duration::from_secs(10)).await;
background
.spawn_worker(format!("block resync worker {}", i), move |must_exit| {
bm2.resync_loop(must_exit)
})
.await;
tokio::time::sleep(Duration::from_secs(10 * (i + 1))).await;
background.spawn_worker(format!("block resync worker {}", i), move |must_exit| {
bm2.resync_loop(must_exit)
});
});
}
}
@ -168,7 +166,7 @@ impl BlockManager {
Ok(f) => f,
Err(e) => {
// Not found but maybe we should have had it ??
self.put_to_resync(hash, 0)?;
self.put_to_resync(hash, Duration::from_millis(0))?;
return Err(Into::into(e));
}
};
@ -176,11 +174,16 @@ impl BlockManager {
f.read_to_end(&mut data).await?;
drop(f);
if data::sha256sum(&data[..]) != *hash {
if blake2sum(&data[..]) != *hash {
let _lock = self.data_dir_lock.lock().await;
warn!("Block {:?} is corrupted. Deleting and resyncing.", hash);
fs::remove_file(path).await?;
self.put_to_resync(&hash, 0)?;
warn!(
"Block {:?} is corrupted. Renaming to .corrupted and resyncing.",
hash
);
let mut path2 = path.clone();
path2.set_extension(".corrupted");
fs::rename(path, path2).await?;
self.put_to_resync(&hash, Duration::from_millis(0))?;
return Err(Error::CorruptData(*hash));
}
@ -191,7 +194,7 @@ impl BlockManager {
let needed = self
.rc
.get(hash.as_ref())?
.map(|x| u64_from_bytes(x.as_ref()) > 0)
.map(|x| u64_from_be_bytes(x) > 0)
.unwrap_or(false);
if needed {
let path = self.block_path(hash);
@ -215,84 +218,95 @@ impl BlockManager {
}
pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
let old_rc = self.rc.get(&hash)?;
self.rc.merge(&hash, vec![1])?;
if old_rc.map(|x| u64_from_bytes(&x[..]) == 0).unwrap_or(true) {
self.put_to_resync(&hash, BLOCK_RW_TIMEOUT.as_millis() as u64)?;
let old_rc = self.rc.fetch_and_update(&hash, |old| {
let old_v = old.map(u64_from_be_bytes).unwrap_or(0);
Some(u64::to_be_bytes(old_v + 1).to_vec())
})?;
let old_rc = old_rc.map(u64_from_be_bytes).unwrap_or(0);
if old_rc == 0 {
self.put_to_resync(&hash, BLOCK_RW_TIMEOUT)?;
}
Ok(())
}
pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
let new_rc = self.rc.merge(&hash, vec![0])?;
if new_rc.map(|x| u64_from_bytes(&x[..]) == 0).unwrap_or(true) {
self.put_to_resync(&hash, 0)?;
let new_rc = self.rc.update_and_fetch(&hash, |old| {
let old_v = old.map(u64_from_be_bytes).unwrap_or(0);
if old_v > 1 {
Some(u64::to_be_bytes(old_v - 1).to_vec())
} else {
None
}
})?;
if new_rc.is_none() {
self.put_to_resync(&hash, BLOCK_GC_TIMEOUT)?;
}
Ok(())
}
fn put_to_resync(&self, hash: &Hash, delay_millis: u64) -> Result<(), Error> {
let when = now_msec() + delay_millis;
fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), Error> {
let when = now_msec() + delay.as_millis() as u64;
trace!("Put resync_queue: {} {:?}", when, hash);
let mut key = u64::to_be_bytes(when).to_vec();
key.extend(hash.as_ref());
self.resync_queue.insert(key, hash.as_ref())?;
self.resync_notify.notify();
self.resync_notify.notify_waiters();
Ok(())
}
async fn resync_loop(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
) -> Result<(), Error> {
let mut n_failures = 0usize;
async fn resync_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
while !*must_exit.borrow() {
if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? {
let time_msec = u64_from_bytes(&time_bytes[0..8]);
let now = now_msec();
if now >= time_msec {
let mut hash = [0u8; 32];
hash.copy_from_slice(hash_bytes.as_ref());
let hash = Hash::from(hash);
if let Err(e) = self.resync_iter(&hash).await {
warn!("Failed to resync block {:?}, retrying later: {}", hash, e);
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT.as_millis() as u64)?;
n_failures += 1;
if n_failures >= 10 {
warn!("Too many resync failures, throttling.");
tokio::time::delay_for(Duration::from_secs(1)).await;
}
} else {
n_failures = 0;
}
} else {
self.resync_queue.insert(time_bytes, hash_bytes)?;
let delay = tokio::time::delay_for(Duration::from_millis(time_msec - now));
select! {
_ = delay.fuse() => (),
_ = self.resync_notify.notified().fuse() => (),
_ = must_exit.recv().fuse() => (),
}
}
} else {
if let Err(e) = self.resync_iter(&mut must_exit).await {
warn!("Error in block resync loop: {}", e);
select! {
_ = self.resync_notify.notified().fuse() => (),
_ = must_exit.recv().fuse() => (),
_ = tokio::time::sleep(Duration::from_secs(10)).fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
}
}
async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<(), Error> {
if let Some(first_item) = self.resync_queue.iter().next() {
let (time_bytes, hash_bytes) = first_item?;
let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
let now = now_msec();
if now >= time_msec {
let hash = Hash::try_from(&hash_bytes[..]).unwrap();
let res = self.resync_block(&hash).await;
if let Err(e) = &res {
warn!("Error when resyncing {:?}: {}", hash, e);
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?;
}
self.resync_queue.remove(&time_bytes)?;
res?; // propagate error to delay main loop
} else {
let delay = tokio::time::sleep(Duration::from_millis(time_msec - now));
select! {
_ = delay.fuse() => (),
_ = self.resync_notify.notified().fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
} else {
select! {
_ = self.resync_notify.notified().fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
Ok(())
}
async fn resync_iter(&self, hash: &Hash) -> Result<(), Error> {
async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
let lock = self.data_dir_lock.lock().await;
let path = self.block_path(hash);
let exists = fs::metadata(&path).await.is_ok();
let needed = self
.rc
.get(hash.as_ref())?
.map(|x| u64_from_bytes(x.as_ref()) > 0)
.map(|x| u64_from_be_bytes(x) > 0)
.unwrap_or(false);
if exists != needed {
@ -305,9 +319,10 @@ impl BlockManager {
if exists && !needed {
trace!("Offloading block {:?}", hash);
let ring = self.system.ring.borrow().clone();
let mut who = self.replication.replication_nodes(&hash, &ring);
let mut who = self.replication.write_nodes(&hash);
if who.len() < self.replication.write_quorum() {
return Err(Error::Message(format!("Not trying to offload block because we don't have a quorum of nodes to write to")));
}
who.retain(|id| *id != self.system.id);
let msg = Arc::new(Message::NeedBlockQuery(*hash));
@ -340,17 +355,17 @@ impl BlockManager {
need_nodes.len()
);
let put_block_message = Arc::new(self.read_block(hash).await?);
let put_resps = join_all(need_nodes.iter().map(|to| {
self.rpc_client
.call_arc(*to, put_block_message.clone(), BLOCK_RW_TIMEOUT)
}))
.await;
for resp in put_resps {
resp?;
}
let put_block_message = self.read_block(hash).await?;
self.rpc_client
.try_call_many(
&need_nodes[..],
put_block_message,
RequestStrategy::with_quorum(need_nodes.len())
.with_timeout(BLOCK_RW_TIMEOUT),
)
.await?;
}
trace!(
info!(
"Deleting block {:?}, offload finished ({} / {})",
hash,
need_nodes.len(),
@ -358,10 +373,11 @@ impl BlockManager {
);
fs::remove_file(path).await?;
self.resync_queue.remove(&hash)?;
}
if needed && !exists {
drop(lock);
// TODO find a way to not do this if they are sending it to us
// Let's suppose this isn't an issue for now with the BLOCK_RW_TIMEOUT delay
// between the RC being incremented and this part being called.
@ -373,7 +389,7 @@ impl BlockManager {
}
pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> {
let who = self.replication.read_nodes(&hash, &self.system);
let who = self.replication.read_nodes(&hash);
let resps = self
.rpc_client
.try_call_many(
@ -397,12 +413,12 @@ impl BlockManager {
}
pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> {
let who = self.replication.write_nodes(&hash, &self.system);
let who = self.replication.write_nodes(&hash);
self.rpc_client
.try_call_many(
&who[..],
Message::PutBlock(PutBlockMessage { hash, data }),
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system))
RequestStrategy::with_quorum(self.replication.write_quorum())
.with_timeout(BLOCK_RW_TIMEOUT),
)
.await?;
@ -414,15 +430,15 @@ impl BlockManager {
let garage = self.garage.load_full().unwrap();
let mut last_hash = None;
let mut i = 0usize;
for entry in garage.block_ref_table.store.iter() {
for entry in garage.block_ref_table.data.store.iter() {
let (_k, v_bytes) = entry?;
let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(v_bytes.as_ref())?;
if Some(&block_ref.block) == last_hash.as_ref() {
continue;
}
if !block_ref.deleted {
if !block_ref.deleted.get() {
last_hash = Some(block_ref.block);
self.put_to_resync(&block_ref.block, 0)?;
self.put_to_resync(&block_ref.block, Duration::from_secs(0))?;
}
i += 1;
if i & 0xFF == 0 && *must_exit.borrow() {
@ -447,8 +463,12 @@ impl BlockManager {
// so that we can offload them if necessary and then delete them locally.
async move {
let mut ls_data_dir = fs::read_dir(path).await?;
while let Some(data_dir_ent) = ls_data_dir.next().await {
let data_dir_ent = data_dir_ent?;
loop {
let data_dir_ent = ls_data_dir.next_entry().await?;
let data_dir_ent = match data_dir_ent {
Some(x) => x,
None => break,
};
let name = data_dir_ent.file_name();
let name = match name.into_string() {
Ok(x) => x,
@ -466,7 +486,7 @@ impl BlockManager {
};
let mut hash = [0u8; 32];
hash.copy_from_slice(&hash_bytes[..]);
self.put_to_resync(&hash.into(), 0)?;
self.put_to_resync(&hash.into(), Duration::from_secs(0))?;
}
if *must_exit.borrow() {
@ -477,32 +497,19 @@ impl BlockManager {
}
.boxed()
}
}
fn u64_from_bytes(bytes: &[u8]) -> u64 {
assert!(bytes.len() == 8);
let mut x8 = [0u8; 8];
x8.copy_from_slice(bytes);
u64::from_be_bytes(x8)
}
pub fn resync_queue_len(&self) -> usize {
self.resync_queue.len()
}
fn rc_merge(_key: &[u8], old: Option<&[u8]>, new: &[u8]) -> Option<Vec<u8>> {
let old = old.map(u64_from_bytes).unwrap_or(0);
assert!(new.len() == 1);
let new = match new[0] {
0 => {
if old > 0 {
old - 1
} else {
0
}
}
1 => old + 1,
_ => unreachable!(),
};
if new == 0 {
None
} else {
Some(u64::to_be_bytes(new).to_vec())
pub fn rc_len(&self) -> usize {
self.rc.len()
}
}
fn u64_from_be_bytes<T: AsRef<[u8]>>(bytes: T) -> u64 {
assert!(bytes.as_ref().len() == 8);
let mut x8 = [0u8; 8];
x8.copy_from_slice(bytes.as_ref());
u64::from_be_bytes(x8)
}

View file

@ -1,9 +1,9 @@
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use garage_util::background::*;
use garage_util::data::*;
use garage_table::crdt::CRDT;
use garage_table::*;
use crate::block::*;
@ -17,7 +17,7 @@ pub struct BlockRef {
pub version: UUID,
// Keep track of deleted status
pub deleted: bool,
pub deleted: crdt::Bool,
}
impl Entry<Hash, UUID> for BlockRef {
@ -27,16 +27,18 @@ impl Entry<Hash, UUID> for BlockRef {
fn sort_key(&self) -> &UUID {
&self.version
}
fn is_tombstone(&self) -> bool {
self.deleted.get()
}
}
impl CRDT for BlockRef {
fn merge(&mut self, other: &Self) {
if other.deleted {
self.deleted = true;
}
self.deleted.merge(&other.deleted);
}
}
pub struct BlockRefTable {
pub background: Arc<BackgroundRunner>,
pub block_manager: Arc<BlockManager>,
}
@ -48,8 +50,8 @@ impl TableSchema for BlockRefTable {
fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
let block = &old.as_ref().or(new.as_ref()).unwrap().block;
let was_before = old.as_ref().map(|x| !x.deleted).unwrap_or(false);
let is_after = new.as_ref().map(|x| !x.deleted).unwrap_or(false);
let was_before = old.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
let is_after = new.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
if is_after && !was_before {
if let Err(e) = self.block_manager.block_incref(block) {
warn!("block_incref failed for block {:?}: {}", block, e);
@ -63,6 +65,6 @@ impl TableSchema for BlockRefTable {
}
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.deleted)
filter.apply(entry.deleted.get())
}
}

View file

@ -5,11 +5,6 @@ use garage_table::*;
use crate::key_table::PermissionSet;
// We import the same file but in its version 0.1.0.
// We can then access v0.1.0 data structures.
// We use them to perform migrations.
use model010::bucket_table as prev;
/// A bucket is a collection of objects
///
/// Its parameters are not directly accessible as:
@ -89,7 +84,9 @@ impl Entry<EmptyKey, String> for Bucket {
fn sort_key(&self) -> &String {
&self.name
}
}
impl CRDT for Bucket {
fn merge(&mut self, other: &Self) {
self.state.merge(&other.state);
}
@ -106,39 +103,4 @@ impl TableSchema for BucketTable {
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.is_deleted())
}
fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
let old = match rmp_serde::decode::from_read_ref::<_, prev::Bucket>(bytes) {
Ok(x) => x,
Err(_) => return None,
};
if old.deleted {
Some(Bucket {
name: old.name,
state: crdt::LWW::migrate_from_raw(old.timestamp, BucketState::Deleted),
})
} else {
let mut keys = crdt::LWWMap::new();
for ak in old.authorized_keys() {
keys.merge(&crdt::LWWMap::migrate_from_raw_item(
ak.key_id.clone(),
ak.timestamp,
PermissionSet {
allow_read: ak.allow_read,
allow_write: ak.allow_write,
},
));
}
let params = BucketParams {
authorized_keys: keys,
website: crdt::LWW::new(false),
};
Some(Bucket {
name: old.name,
state: crdt::LWW::migrate_from_raw(old.timestamp, BucketState::Present(params)),
})
}
}
}

View file

@ -7,8 +7,8 @@ use garage_rpc::membership::System;
use garage_rpc::rpc_client::RpcHttpClient;
use garage_rpc::rpc_server::RpcServer;
use garage_table::table_fullcopy::*;
use garage_table::table_sharded::*;
use garage_table::replication::fullcopy::*;
use garage_table::replication::sharded::*;
use garage_table::*;
use crate::block::*;
@ -35,7 +35,7 @@ pub struct Garage {
}
impl Garage {
pub async fn new(
pub fn new(
config: Config,
db: sled::Db,
background: Arc<BackgroundRunner>,
@ -54,18 +54,23 @@ impl Garage {
);
let data_rep_param = TableShardedReplication {
system: system.clone(),
replication_factor: config.data_replication_factor,
write_quorum: (config.data_replication_factor + 1) / 2,
read_quorum: 1,
};
let meta_rep_param = TableShardedReplication {
system: system.clone(),
replication_factor: config.meta_replication_factor,
write_quorum: (config.meta_replication_factor + 1) / 2,
read_quorum: (config.meta_replication_factor + 1) / 2,
};
let control_rep_param = TableFullReplication::new(config.control_write_max_faults);
let control_rep_param = TableFullReplication {
system: system.clone(),
max_faults: config.control_write_max_faults,
};
info!("Initialize block manager...");
let block_manager = BlockManager::new(
@ -79,7 +84,6 @@ impl Garage {
info!("Initialize block_ref_table...");
let block_ref_table = Table::new(
BlockRefTable {
background: background.clone(),
block_manager: block_manager.clone(),
},
data_rep_param.clone(),
@ -87,8 +91,7 @@ impl Garage {
&db,
"block_ref".to_string(),
rpc_server,
)
.await;
);
info!("Initialize version_table...");
let version_table = Table::new(
@ -101,8 +104,7 @@ impl Garage {
&db,
"version".to_string(),
rpc_server,
)
.await;
);
info!("Initialize object_table...");
let object_table = Table::new(
@ -115,8 +117,7 @@ impl Garage {
&db,
"object".to_string(),
rpc_server,
)
.await;
);
info!("Initialize bucket_table...");
let bucket_table = Table::new(
@ -126,8 +127,7 @@ impl Garage {
&db,
"bucket".to_string(),
rpc_server,
)
.await;
);
info!("Initialize key_table_table...");
let key_table = Table::new(
@ -137,8 +137,7 @@ impl Garage {
&db,
"key".to_string(),
rpc_server,
)
.await;
);
info!("Initialize Garage...");
let garage = Arc::new(Self {
@ -156,7 +155,7 @@ impl Garage {
info!("Start block manager background thread...");
garage.block_manager.garage.swap(Some(garage.clone()));
garage.block_manager.clone().spawn_background_worker().await;
garage.block_manager.clone().spawn_background_worker();
garage
}

View file

@ -1,10 +1,8 @@
use serde::{Deserialize, Serialize};
use garage_table::crdt::CRDT;
use garage_table::crdt::*;
use garage_table::*;
use model010::key_table as prev;
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct Key {
// Primary key
@ -66,6 +64,10 @@ pub struct PermissionSet {
pub allow_write: bool,
}
impl AutoCRDT for PermissionSet {
const WARN_IF_DIFFERENT: bool = true;
}
impl Entry<EmptyKey, String> for Key {
fn partition_key(&self) -> &EmptyKey {
&EmptyKey
@ -73,55 +75,43 @@ impl Entry<EmptyKey, String> for Key {
fn sort_key(&self) -> &String {
&self.key_id
}
}
impl CRDT for Key {
fn merge(&mut self, other: &Self) {
self.name.merge(&other.name);
self.deleted.merge(&other.deleted);
if self.deleted.get() {
self.authorized_buckets.clear();
return;
} else {
self.authorized_buckets.merge(&other.authorized_buckets);
}
self.authorized_buckets.merge(&other.authorized_buckets);
}
}
pub struct KeyTable;
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum KeyFilter {
Deleted(DeletedFilter),
Matches(String),
}
impl TableSchema for KeyTable {
type P = EmptyKey;
type S = String;
type E = Key;
type Filter = DeletedFilter;
type Filter = KeyFilter;
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.deleted.get())
}
fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
let old = match rmp_serde::decode::from_read_ref::<_, prev::Key>(bytes) {
Ok(x) => x,
Err(_) => return None,
};
let mut new = Self::E {
key_id: old.key_id.clone(),
secret_key: old.secret_key.clone(),
name: crdt::LWW::migrate_from_raw(old.name_timestamp, old.name.clone()),
deleted: crdt::Bool::new(old.deleted),
authorized_buckets: crdt::LWWMap::new(),
};
for ab in old.authorized_buckets() {
let it = crdt::LWWMap::migrate_from_raw_item(
ab.bucket.clone(),
ab.timestamp,
PermissionSet {
allow_read: ab.allow_read,
allow_write: ab.allow_write,
},
);
new.authorized_buckets.merge(&it);
match filter {
KeyFilter::Deleted(df) => df.apply(entry.deleted.get()),
KeyFilter::Matches(pat) => {
let pat = pat.to_lowercase();
entry.key_id.to_lowercase().starts_with(&pat)
|| entry.name.get().to_lowercase() == pat
}
}
Some(new)
}
}

View file

@ -5,13 +5,12 @@ use std::sync::Arc;
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_table::table_sharded::*;
use garage_table::crdt::*;
use garage_table::replication::sharded::*;
use garage_table::*;
use crate::version_table::*;
use model010::object_table as prev;
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct Object {
// Primary key
@ -70,7 +69,7 @@ pub enum ObjectVersionState {
Aborted,
}
impl ObjectVersionState {
impl CRDT for ObjectVersionState {
fn merge(&mut self, other: &Self) {
use ObjectVersionState::*;
match other {
@ -91,37 +90,30 @@ impl ObjectVersionState {
}
}
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub enum ObjectVersionData {
DeleteMarker,
Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
FirstBlock(ObjectVersionMeta, Hash),
}
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
impl AutoCRDT for ObjectVersionData {
const WARN_IF_DIFFERENT: bool = true;
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersionMeta {
pub headers: ObjectVersionHeaders,
pub size: u64,
pub etag: String,
}
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersionHeaders {
pub content_type: String,
pub other: BTreeMap<String, String>,
}
impl ObjectVersionData {
fn merge(&mut self, b: &Self) {
if *self != *b {
warn!(
"Inconsistent object version data: {:?} (local) vs {:?} (remote)",
self, b
);
}
}
}
impl ObjectVersion {
fn cmp_key(&self) -> (u64, UUID) {
(self.timestamp, self.uuid)
@ -154,8 +146,11 @@ impl Entry<String, String> for Object {
fn sort_key(&self) -> &String {
&self.key
}
}
impl CRDT for Object {
fn merge(&mut self, other: &Self) {
// Merge versions from other into here
for other_v in other.versions.iter() {
match self
.versions
@ -169,6 +164,9 @@ impl Entry<String, String> for Object {
}
}
}
// Remove versions which are obsolete, i.e. those that come
// before the last version which .is_complete().
let last_complete = self
.versions
.iter()
@ -212,13 +210,8 @@ impl TableSchema for ObjectTable {
}
};
if newly_deleted {
let deleted_version = Version::new(
v.uuid,
old_v.bucket.clone(),
old_v.key.clone(),
true,
vec![],
);
let deleted_version =
Version::new(v.uuid, old_v.bucket.clone(), old_v.key.clone(), true);
version_table.insert(&deleted_version).await?;
}
}
@ -231,55 +224,4 @@ impl TableSchema for ObjectTable {
let deleted = !entry.versions.iter().any(|v| v.is_data());
filter.apply(deleted)
}
fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
let old = match rmp_serde::decode::from_read_ref::<_, prev::Object>(bytes) {
Ok(x) => x,
Err(_) => return None,
};
let new_v = old
.versions()
.iter()
.map(migrate_version)
.collect::<Vec<_>>();
let new = Object::new(old.bucket.clone(), old.key.clone(), new_v);
Some(new)
}
}
fn migrate_version(old: &prev::ObjectVersion) -> ObjectVersion {
let headers = ObjectVersionHeaders {
content_type: old.mime_type.clone(),
other: BTreeMap::new(),
};
let meta = ObjectVersionMeta {
headers: headers.clone(),
size: old.size,
etag: "".to_string(),
};
let state = match old.state {
prev::ObjectVersionState::Uploading => ObjectVersionState::Uploading(headers),
prev::ObjectVersionState::Aborted => ObjectVersionState::Aborted,
prev::ObjectVersionState::Complete => match &old.data {
prev::ObjectVersionData::Uploading => ObjectVersionState::Uploading(headers),
prev::ObjectVersionData::DeleteMarker => {
ObjectVersionState::Complete(ObjectVersionData::DeleteMarker)
}
prev::ObjectVersionData::Inline(x) => {
ObjectVersionState::Complete(ObjectVersionData::Inline(meta, x.clone()))
}
prev::ObjectVersionData::FirstBlock(h) => {
let mut hash = [0u8; 32];
hash.copy_from_slice(h.as_ref());
ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, Hash::from(hash)))
}
},
};
let mut uuid = [0u8; 32];
uuid.copy_from_slice(old.uuid.as_ref());
ObjectVersion {
uuid: UUID::from(uuid),
timestamp: old.timestamp,
state,
}
}

View file

@ -4,7 +4,8 @@ use std::sync::Arc;
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_table::table_sharded::*;
use garage_table::crdt::*;
use garage_table::replication::sharded::*;
use garage_table::*;
use crate::block_ref_table::*;
@ -15,8 +16,11 @@ pub struct Version {
pub uuid: UUID,
// Actual data: the blocks for this version
pub deleted: bool,
blocks: Vec<VersionBlock>,
// In the case of a multipart upload, also store the etags
// of individual parts and check them when doing CompleteMultipartUpload
pub deleted: crdt::Bool,
pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
pub parts_etags: crdt::Map<u64, String>,
// Back link to bucket+key so that we can figure if
// this was deleted later on
@ -25,56 +29,46 @@ pub struct Version {
}
impl Version {
pub fn new(
uuid: UUID,
bucket: String,
key: String,
deleted: bool,
blocks: Vec<VersionBlock>,
) -> Self {
let mut ret = Self {
pub fn new(uuid: UUID, bucket: String, key: String, deleted: bool) -> Self {
Self {
uuid,
deleted,
blocks: vec![],
deleted: deleted.into(),
blocks: crdt::Map::new(),
parts_etags: crdt::Map::new(),
bucket,
key,
};
for b in blocks {
ret.add_block(b)
.expect("Twice the same VersionBlock in Version constructor");
}
ret
}
/// Adds a block if it wasn't already present
pub fn add_block(&mut self, new: VersionBlock) -> Result<(), ()> {
match self
.blocks
.binary_search_by(|b| b.cmp_key().cmp(&new.cmp_key()))
{
Err(i) => {
self.blocks.insert(i, new);
Ok(())
}
Ok(_) => Err(()),
}
}
pub fn blocks(&self) -> &[VersionBlock] {
&self.blocks[..]
}
}
#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct VersionBlock {
#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct VersionBlockKey {
pub part_number: u64,
pub offset: u64,
}
impl Ord for VersionBlockKey {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.part_number
.cmp(&other.part_number)
.then(self.offset.cmp(&other.offset))
}
}
impl PartialOrd for VersionBlockKey {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
pub struct VersionBlock {
pub hash: Hash,
pub size: u64,
}
impl VersionBlock {
fn cmp_key(&self) -> (u64, u64) {
(self.part_number, self.offset)
}
impl AutoCRDT for VersionBlock {
const WARN_IF_DIFFERENT: bool = true;
}
impl Entry<Hash, EmptyKey> for Version {
@ -84,23 +78,21 @@ impl Entry<Hash, EmptyKey> for Version {
fn sort_key(&self) -> &EmptyKey {
&EmptyKey
}
fn is_tombstone(&self) -> bool {
self.deleted.get()
}
}
impl CRDT for Version {
fn merge(&mut self, other: &Self) {
if other.deleted {
self.deleted = true;
self.deleted.merge(&other.deleted);
if self.deleted.get() {
self.blocks.clear();
} else if !self.deleted {
for bi in other.blocks.iter() {
match self
.blocks
.binary_search_by(|x| x.cmp_key().cmp(&bi.cmp_key()))
{
Ok(_) => (),
Err(pos) => {
self.blocks.insert(pos, bi.clone());
}
}
}
self.parts_etags.clear();
} else {
self.blocks.merge(&other.blocks);
self.parts_etags.merge(&other.parts_etags);
}
}
}
@ -121,14 +113,15 @@ impl TableSchema for VersionTable {
self.background.spawn(async move {
if let (Some(old_v), Some(new_v)) = (old, new) {
// Propagate deletion of version blocks
if new_v.deleted && !old_v.deleted {
if new_v.deleted.get() && !old_v.deleted.get() {
let deleted_block_refs = old_v
.blocks
.items()
.iter()
.map(|vb| BlockRef {
.map(|(_k, vb)| BlockRef {
block: vb.hash,
version: old_v.uuid,
deleted: true,
deleted: true.into(),
})
.collect::<Vec<_>>();
block_ref_table.insert_many(&deleted_block_refs[..]).await?;
@ -139,6 +132,6 @@ impl TableSchema for VersionTable {
}
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
filter.apply(entry.deleted)
filter.apply(entry.deleted.get())
}
}

View file

@ -15,27 +15,26 @@ path = "lib.rs"
[dependencies]
garage_util = { version = "0.1.1", path = "../util" }
bytes = "0.4"
rand = "0.7"
hex = "0.3"
sha2 = "0.8"
arc-swap = "0.4"
bytes = "1.0"
hex = "0.4"
arc-swap = "1.0"
gethostname = "0.2"
log = "0.4"
rmp-serde = "0.14.3"
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_json = "1.0"
futures = "0.3"
futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] }
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
tokio-stream = { version = "0.1", features = ["net"] }
http = "0.2"
hyper = "0.13"
rustls = "0.17"
tokio-rustls = "0.13"
hyper-rustls = { version = "0.20", default-features = false }
hyper = { version = "0.14", features = ["full"] }
rustls = "0.19"
tokio-rustls = "0.22"
hyper-rustls = { version = "0.22", default-features = false }
webpki = "0.21"

View file

@ -11,13 +11,14 @@ use futures::future::join_all;
use futures::select;
use futures_util::future::*;
use serde::{Deserialize, Serialize};
use tokio::prelude::*;
use tokio::io::AsyncWriteExt;
use tokio::sync::watch;
use tokio::sync::Mutex;
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_util::error::Error;
use garage_util::time::*;
use crate::consul::get_consul_nodes;
use crate::ring::*;
@ -315,23 +316,17 @@ impl System {
self.clone().ping_nodes(bootstrap_peers).await;
let self2 = self.clone();
self.clone()
.background
self.background
.spawn_worker(format!("ping loop"), |stop_signal| {
self2.ping_loop(stop_signal).map(Ok)
})
.await;
self2.ping_loop(stop_signal)
});
if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) {
let self2 = self.clone();
self.clone()
.background
self.background
.spawn_worker(format!("Consul loop"), |stop_signal| {
self2
.consul_loop(stop_signal, consul_host, consul_service_name)
.map(Ok)
})
.await;
self2.consul_loop(stop_signal, consul_host, consul_service_name)
});
}
}
@ -399,7 +394,7 @@ impl System {
if has_changes {
status.recalculate_hash();
}
if let Err(e) = update_locked.0.broadcast(Arc::new(status)) {
if let Err(e) = update_locked.0.send(Arc::new(status)) {
error!("In ping_nodes: could not save status update ({})", e);
}
drop(update_locked);
@ -425,7 +420,7 @@ impl System {
let status_hash = status.hash;
let config_version = self.ring.borrow().config.version;
update_locked.0.broadcast(Arc::new(status))?;
update_locked.0.send(Arc::new(status))?;
drop(update_locked);
if is_new || status_hash != ping.status_hash {
@ -507,7 +502,7 @@ impl System {
if has_changed {
status.recalculate_hash();
}
update_lock.0.broadcast(Arc::new(status))?;
update_lock.0.send(Arc::new(status))?;
drop(update_lock);
if to_ping.len() > 0 {
@ -527,7 +522,7 @@ impl System {
if adv.version > ring.config.version {
let ring = Ring::new(adv.clone());
update_lock.1.broadcast(Arc::new(ring))?;
update_lock.1.send(Arc::new(ring))?;
drop(update_lock);
self.background.spawn_cancellable(
@ -543,7 +538,7 @@ impl System {
async fn ping_loop(self: Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
loop {
let restart_at = tokio::time::delay_for(PING_INTERVAL);
let restart_at = tokio::time::sleep(PING_INTERVAL);
let status = self.status.borrow().clone();
let ping_addrs = status
@ -557,10 +552,9 @@ impl System {
select! {
_ = restart_at.fuse() => (),
must_exit = stop_signal.recv().fuse() => {
match must_exit {
None | Some(true) => return,
_ => (),
_ = stop_signal.changed().fuse() => {
if *stop_signal.borrow() {
return;
}
}
}
@ -573,8 +567,8 @@ impl System {
consul_host: String,
consul_service_name: String,
) {
loop {
let restart_at = tokio::time::delay_for(CONSUL_INTERVAL);
while !*stop_signal.borrow() {
let restart_at = tokio::time::sleep(CONSUL_INTERVAL);
match get_consul_nodes(&consul_host, &consul_service_name).await {
Ok(mut node_list) => {
@ -588,12 +582,7 @@ impl System {
select! {
_ = restart_at.fuse() => (),
must_exit = stop_signal.recv().fuse() => {
match must_exit {
None | Some(true) => return,
_ => (),
}
}
_ = stop_signal.changed().fuse() => (),
}
}
}

View file

@ -5,6 +5,11 @@ use serde::{Deserialize, Serialize};
use garage_util::data::*;
// A partition number is encoded on 16 bits,
// i.e. we have up to 2**16 partitions.
// (in practice we have exactly 2**PARTITION_BITS partitions)
pub type Partition = u16;
// TODO: make this constant parametrizable in the config file
// For deployments with many nodes it might make sense to bump
// it up to 10.
@ -161,29 +166,48 @@ impl Ring {
})
.collect::<Vec<_>>();
eprintln!("RING: --");
for e in ring.iter() {
eprintln!("{:?}", e);
}
eprintln!("END --");
// eprintln!("RING: --");
// for e in ring.iter() {
// eprintln!("{:?}", e);
// }
// eprintln!("END --");
Self { config, ring }
}
pub fn partition_of(&self, from: &Hash) -> Partition {
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
top >> (16 - PARTITION_BITS)
}
pub fn partitions(&self) -> Vec<(Partition, Hash)> {
let mut ret = vec![];
for (i, entry) in self.ring.iter().enumerate() {
ret.push((i as u16, entry.location));
}
if ret.len() > 0 {
assert_eq!(ret[0].1, [0u8; 32].into());
}
ret
}
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
if self.ring.len() != 1 << PARTITION_BITS {
warn!("Ring not yet ready, read/writes will be lost");
warn!("Ring not yet ready, read/writes will be lost!");
return vec![];
}
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
assert_eq!(partition_idx, self.partition_of(from) as usize);
let partition = &self.ring[partition_idx];
let partition_top =
u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap());
assert!(partition_top & PARTITION_MASK_U16 == top & PARTITION_MASK_U16);
assert_eq!(partition_top & PARTITION_MASK_U16, top & PARTITION_MASK_U16);
assert!(n <= partition.nodes.len());
partition.nodes[..n].iter().cloned().collect::<Vec<_>>()

View file

@ -7,7 +7,6 @@ use std::sync::Arc;
use std::time::Duration;
use arc_swap::ArcSwapOption;
use bytes::IntoBuf;
use futures::future::Future;
use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt;
@ -197,11 +196,8 @@ impl<M: RpcMessage + 'static> RpcClient<M> {
if !strategy.rs_interrupt_after_quorum {
let wait_finished_fut = tokio::spawn(async move {
resp_stream.collect::<Vec<_>>().await;
Ok(())
});
self.background.spawn(wait_finished_fut.map(|x| {
x.unwrap_or_else(|e| Err(Error::Message(format!("Await failed: {}", e))))
}));
self.background.spawn(wait_finished_fut.map(|_| Ok(())));
}
Ok(results)
@ -336,7 +332,7 @@ impl RpcHttpClient {
let body = hyper::body::to_bytes(resp.into_body()).await?;
drop(slot);
match rmp_serde::decode::from_read::<_, Result<M, String>>(body.into_buf())? {
match rmp_serde::decode::from_read::<_, Result<M, String>>(&body[..])? {
Err(e) => Ok(Err(Error::RemoteError(e, status))),
Ok(x) => Ok(Ok(x)),
}

View file

@ -4,7 +4,6 @@ use std::pin::Pin;
use std::sync::Arc;
use std::time::Instant;
use bytes::IntoBuf;
use futures::future::Future;
use futures_util::future::*;
use futures_util::stream::*;
@ -15,6 +14,7 @@ use serde::{Deserialize, Serialize};
use tokio::net::{TcpListener, TcpStream};
use tokio_rustls::server::TlsStream;
use tokio_rustls::TlsAcceptor;
use tokio_stream::wrappers::TcpListenerStream;
use garage_util::config::TlsConfig;
use garage_util::data::*;
@ -47,11 +47,15 @@ where
{
let begin_time = Instant::now();
let whole_body = hyper::body::to_bytes(req.into_body()).await?;
let msg = rmp_serde::decode::from_read::<_, M>(whole_body.into_buf())?;
let msg = rmp_serde::decode::from_read::<_, M>(&whole_body[..])?;
trace!(
"Request message: {}",
serde_json::to_string(&msg).unwrap_or("<json error>".into())
serde_json::to_string(&msg)
.unwrap_or("<json error>".into())
.chars()
.take(100)
.collect::<String>()
);
match handler(msg, sockaddr).await {
@ -171,8 +175,8 @@ impl RpcServer {
config.set_single_cert([&node_certs[..], &ca_certs[..]].concat(), node_key)?;
let tls_acceptor = Arc::new(TlsAcceptor::from(Arc::new(config)));
let mut listener = TcpListener::bind(&self.bind_addr).await?;
let incoming = listener.incoming().filter_map(|socket| async {
let listener = TcpListener::bind(&self.bind_addr).await?;
let incoming = TcpListenerStream::new(listener).filter_map(|socket| async {
match socket {
Ok(stream) => match tls_acceptor.clone().accept(stream).await {
Ok(x) => Some(Ok::<_, hyper::Error>(x)),

View file

@ -16,21 +16,18 @@ path = "lib.rs"
garage_util = { version = "0.1.1", path = "../util" }
garage_rpc = { version = "0.1.1", path = "../rpc" }
bytes = "0.4"
rand = "0.7"
hex = "0.3"
arc-swap = "0.4"
bytes = "1.0"
rand = "0.8"
log = "0.4"
hexdump = "0.1"
sled = "0.34"
rmp-serde = "0.14.3"
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11"
async-trait = "0.1.30"
futures = "0.3"
futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] }
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }

View file

@ -1,327 +0,0 @@
//! This package provides a simple implementation of conflict-free replicated data types (CRDTs)
//!
//! CRDTs are a type of data structures that do not require coordination. In other words, we can
//! edit them in parallel, we will always find a way to merge it.
//!
//! A general example is a counter. Its initial value is 0. Alice and Bob get a copy of the
//! counter. Alice does +1 on her copy, she reads 1. Bob does +3 on his copy, he reads 3. Now,
//! it is easy to merge their counters, order does not count: we always get 4.
//!
//! Learn more about CRDT [on Wikipedia](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type)
use serde::{Deserialize, Serialize};
use garage_util::data::*;
/// Definition of a CRDT - all CRDT Rust types implement this.
///
/// A CRDT is defined as a merge operator that respects a certain set of axioms.
///
/// In particular, the merge operator must be commutative, associative,
/// idempotent, and monotonic.
/// In other words, if `a`, `b` and `c` are CRDTs, and `⊔` denotes the merge operator,
/// the following axioms must apply:
///
/// ```text
/// a ⊔ b = b ⊔ a (commutativity)
/// (a ⊔ b) ⊔ c = a ⊔ (b ⊔ c) (associativity)
/// (a ⊔ b) ⊔ b = a ⊔ b (idempotence)
/// ```
///
/// Moreover, the relationship `≥` defined by `a ≥ b ⇔ ∃c. a = b ⊔ c` must be a partial order.
/// This implies a few properties such as: if `a ⊔ b ≠ a`, then there is no `c` such that `(a ⊔ b) ⊔ c = a`,
/// as this would imply a cycle in the partial order.
pub trait CRDT {
/// Merge the two datastructures according to the CRDT rules.
/// `self` is modified to contain the merged CRDT value. `other` is not modified.
///
/// # Arguments
///
/// * `other` - the other CRDT we wish to merge with
fn merge(&mut self, other: &Self);
}
/// All types that implement `Ord` (a total order) also implement a trivial CRDT
/// defined by the merge rule: `a ⊔ b = max(a, b)`.
impl<T> CRDT for T
where
T: Ord + Clone,
{
fn merge(&mut self, other: &Self) {
if other > self {
*self = other.clone();
}
}
}
// ---- LWW Register ----
/// Last Write Win (LWW)
///
/// An LWW CRDT associates a timestamp with a value, in order to implement a
/// time-based reconciliation rule: the most recent write wins.
/// For completeness, the LWW reconciliation rule must also be defined for two LWW CRDTs
/// with the same timestamp but different values.
///
/// In our case, we add the constraint that the value that is wrapped inside the LWW CRDT must
/// itself be a CRDT: in the case when the timestamp does not allow us to decide on which value to
/// keep, the merge rule of the inner CRDT is applied on the wrapped values. (Note that all types
/// that implement the `Ord` trait get a default CRDT implemetnation that keeps the maximum value.
/// This enables us to use LWW directly with primitive data types such as numbers or strings. It is
/// generally desirable in this case to never explicitly produce LWW values with the same timestamp
/// but different inner values, as the rule to keep the maximum value isn't generally the desired
/// semantics.)
///
/// As multiple computers clocks are always desynchronized,
/// when operations are close enough, it is equivalent to
/// take one copy and drop the other one.
///
/// Given that clocks are not too desynchronized, this assumption
/// is enough for most cases, as there is few chance that two humans
/// coordonate themself faster than the time difference between two NTP servers.
///
/// As a more concret example, let's suppose you want to upload a file
/// with the same key (path) in the same bucket at the very same time.
/// For each request, the file will be timestamped by the receiving server
/// and may differ from what you observed with your atomic clock!
///
/// This scheme is used by AWS S3 or Soundcloud and often without knowing
/// in entreprise when reconciliating databases with ad-hoc scripts.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWW<T> {
ts: u64,
v: T,
}
impl<T> LWW<T>
where
T: CRDT,
{
/// Creates a new CRDT
///
/// CRDT's internal timestamp is set with current node's clock.
pub fn new(value: T) -> Self {
Self {
ts: now_msec(),
v: value,
}
}
/// Build a new CRDT from a previous non-compatible one
///
/// Compared to new, the CRDT's timestamp is not set to now
/// but must be set to the previous, non-compatible, CRDT's timestamp.
pub fn migrate_from_raw(ts: u64, value: T) -> Self {
Self { ts, v: value }
}
/// Update the LWW CRDT while keeping some causal ordering.
///
/// The timestamp of the LWW CRDT is updated to be the current node's clock
/// at time of update, or the previous timestamp + 1 if that's bigger,
/// so that the new timestamp is always strictly larger than the previous one.
/// This ensures that merging the update with the old value will result in keeping
/// the updated value.
pub fn update(&mut self, new_value: T) {
self.ts = std::cmp::max(self.ts + 1, now_msec());
self.v = new_value;
}
/// Get the CRDT value
pub fn get(&self) -> &T {
&self.v
}
/// Get a mutable reference to the CRDT's value
///
/// This is usefull to mutate the inside value without changing the LWW timestamp.
/// When such mutation is done, the merge between two LWW values is done using the inner
/// CRDT's merge operation. This is usefull in the case where the inner CRDT is a large
/// data type, such as a map, and we only want to change a single item in the map.
/// To do this, we can produce a "CRDT delta", i.e. a LWW that contains only the modification.
/// This delta consists in a LWW with the same timestamp, and the map
/// inside only contains the updated value.
/// The advantage of such a delta is that it is much smaller than the whole map.
///
/// Avoid using this if the inner data type is a primitive type such as a number or a string,
/// as you will then rely on the merge function defined on `Ord` types by keeping the maximum
/// of both values.
pub fn get_mut(&mut self) -> &mut T {
&mut self.v
}
}
impl<T> CRDT for LWW<T>
where
T: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
if other.ts > self.ts {
self.ts = other.ts;
self.v = other.v.clone();
} else if other.ts == self.ts {
self.v.merge(&other.v);
}
}
}
/// Boolean, where `true` is an absorbing state
#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
pub struct Bool(bool);
impl Bool {
/// Create a new boolean with the specified value
pub fn new(b: bool) -> Self {
Self(b)
}
/// Set the boolean to true
pub fn set(&mut self) {
self.0 = true;
}
/// Get the boolean value
pub fn get(&self) -> bool {
self.0
}
}
impl CRDT for Bool {
fn merge(&mut self, other: &Self) {
self.0 = self.0 || other.0;
}
}
/// Last Write Win Map
///
/// This types defines a CRDT for a map from keys to values.
/// The values have an associated timestamp, such that the last written value
/// takes precedence over previous ones. As for the simpler `LWW` type, the value
/// type `V` is also required to implement the CRDT trait.
/// We do not encourage mutating the values associated with a given key
/// without updating the timestamp, in fact at the moment we do not provide a `.get_mut()`
/// method that would allow that.
///
/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
/// such that two values can be compared for equality based on their hashes). As a consequence,
/// insertions take `O(n)` time. This means that LWWMap should be used for reasonably small maps.
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWWMap<K, V> {
vals: Vec<(K, u64, V)>,
}
impl<K, V> LWWMap<K, V>
where
K: Ord,
V: CRDT,
{
/// Create a new empty map CRDT
pub fn new() -> Self {
Self { vals: vec![] }
}
/// Used to migrate from a map defined in an incompatible format. This produces
/// a map that contains a single item with the specified timestamp (copied from
/// the incompatible format). Do this as many times as you have items to migrate,
/// and put them all together using the CRDT merge operator.
pub fn migrate_from_raw_item(k: K, ts: u64, v: V) -> Self {
Self {
vals: vec![(k, ts, v)],
}
}
/// Returns a map that contains a single mapping from the specified key to the specified value.
/// This map is a mutator, or a delta-CRDT, such that when it is merged with the original map,
/// the previous value will be replaced with the one specified here.
/// The timestamp in the provided mutator is set to the maximum of the current system's clock
/// and 1 + the previous value's timestamp (if there is one), so that the new value will always
/// take precedence (LWW rule).
///
/// Typically, to update the value associated to a key in the map, you would do the following:
///
/// ```ignore
/// let my_update = my_crdt.update_mutator(key_to_modify, new_value);
/// my_crdt.merge(&my_update);
/// ```
///
/// However extracting the mutator on its own and only sending that on the network is very
/// interesting as it is much smaller than the whole map.
pub fn update_mutator(&self, k: K, new_v: V) -> Self {
let new_vals = match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, old_ts, _) = self.vals[i];
let new_ts = std::cmp::max(old_ts + 1, now_msec());
vec![(k, new_ts, new_v)]
}
Err(_) => vec![(k, now_msec(), new_v)],
};
Self { vals: new_vals }
}
/// Takes all of the values of the map and returns them. The current map is reset to the
/// empty map. This is very usefull to produce in-place a new map that contains only a delta
/// that modifies a certain value:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a = a.take_and_clear();
/// a.merge(&old_a.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
///
/// Of course in this simple example we could have written simply
/// `pyt_my_crdt_value(a.update_mutator(key_to_modify, new_value))`,
/// but in the case where the map is a field in a struct for instance (as is always the case),
/// this becomes very handy:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a_map = a.map_field.take_and_clear();
/// a.map_field.merge(&old_a_map.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
pub fn take_and_clear(&mut self) -> Self {
let vals = std::mem::replace(&mut self.vals, vec![]);
Self { vals }
}
/// Removes all values from the map
pub fn clear(&mut self) {
self.vals.clear();
}
/// Get a reference to the value assigned to a key
pub fn get(&self, k: &K) -> Option<&V> {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => Some(&self.vals[i].2),
Err(_) => None,
}
}
/// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
/// In most case you will want to ignore the timestamp (second item of the tuple).
pub fn items(&self) -> &[(K, u64, V)] {
&self.vals[..]
}
}
impl<K, V> CRDT for LWWMap<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
for (k, ts2, v2) in other.vals.iter() {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, ts1, _v1) = &self.vals[i];
if ts2 > ts1 {
self.vals[i].1 = *ts2;
self.vals[i].2 = v2.clone();
} else if ts1 == ts2 {
self.vals[i].2.merge(&v2);
}
}
Err(i) => {
self.vals.insert(i, (k.clone(), *ts2, v2.clone()));
}
}
}
}
}

34
src/table/crdt/bool.rs Normal file
View file

@ -0,0 +1,34 @@
use serde::{Deserialize, Serialize};
use crate::crdt::crdt::*;
/// Boolean, where `true` is an absorbing state
#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
pub struct Bool(bool);
impl Bool {
/// Create a new boolean with the specified value
pub fn new(b: bool) -> Self {
Self(b)
}
/// Set the boolean to true
pub fn set(&mut self) {
self.0 = true;
}
/// Get the boolean value
pub fn get(&self) -> bool {
self.0
}
}
impl From<bool> for Bool {
fn from(b: bool) -> Bool {
Bool::new(b)
}
}
impl CRDT for Bool {
fn merge(&mut self, other: &Self) {
self.0 = self.0 || other.0;
}
}

73
src/table/crdt/crdt.rs Normal file
View file

@ -0,0 +1,73 @@
use garage_util::data::*;
/// Definition of a CRDT - all CRDT Rust types implement this.
///
/// A CRDT is defined as a merge operator that respects a certain set of axioms.
///
/// In particular, the merge operator must be commutative, associative,
/// idempotent, and monotonic.
/// In other words, if `a`, `b` and `c` are CRDTs, and `⊔` denotes the merge operator,
/// the following axioms must apply:
///
/// ```text
/// a ⊔ b = b ⊔ a (commutativity)
/// (a ⊔ b) ⊔ c = a ⊔ (b ⊔ c) (associativity)
/// (a ⊔ b) ⊔ b = a ⊔ b (idempotence)
/// ```
///
/// Moreover, the relationship `≥` defined by `a ≥ b ⇔ ∃c. a = b ⊔ c` must be a partial order.
/// This implies a few properties such as: if `a ⊔ b ≠ a`, then there is no `c` such that `(a ⊔ b) ⊔ c = a`,
/// as this would imply a cycle in the partial order.
pub trait CRDT {
/// Merge the two datastructures according to the CRDT rules.
/// `self` is modified to contain the merged CRDT value. `other` is not modified.
///
/// # Arguments
///
/// * `other` - the other CRDT we wish to merge with
fn merge(&mut self, other: &Self);
}
/// All types that implement `Ord` (a total order) can also implement a trivial CRDT
/// defined by the merge rule: `a ⊔ b = max(a, b)`. Implement this trait for your type
/// to enable this behavior.
pub trait AutoCRDT: Ord + Clone + std::fmt::Debug {
/// WARN_IF_DIFFERENT: emit a warning when values differ. Set this to true if
/// different values in your application should never happen. Set this to false
/// if you are actually relying on the semantics of `a ⊔ b = max(a, b)`.
const WARN_IF_DIFFERENT: bool;
}
impl<T> CRDT for T
where
T: AutoCRDT,
{
fn merge(&mut self, other: &Self) {
if Self::WARN_IF_DIFFERENT && self != other {
warn!(
"Different CRDT values should be the same (logic error!): {:?} vs {:?}",
self, other
);
if other > self {
*self = other.clone();
}
warn!("Making an arbitrary choice: {:?}", self);
} else {
if other > self {
*self = other.clone();
}
}
}
}
impl AutoCRDT for String {
const WARN_IF_DIFFERENT: bool = true;
}
impl AutoCRDT for bool {
const WARN_IF_DIFFERENT: bool = true;
}
impl AutoCRDT for FixedBytes32 {
const WARN_IF_DIFFERENT: bool = true;
}

114
src/table/crdt/lww.rs Normal file
View file

@ -0,0 +1,114 @@
use serde::{Deserialize, Serialize};
use garage_util::time::now_msec;
use crate::crdt::crdt::*;
/// Last Write Win (LWW)
///
/// An LWW CRDT associates a timestamp with a value, in order to implement a
/// time-based reconciliation rule: the most recent write wins.
/// For completeness, the LWW reconciliation rule must also be defined for two LWW CRDTs
/// with the same timestamp but different values.
///
/// In our case, we add the constraint that the value that is wrapped inside the LWW CRDT must
/// itself be a CRDT: in the case when the timestamp does not allow us to decide on which value to
/// keep, the merge rule of the inner CRDT is applied on the wrapped values. (Note that all types
/// that implement the `Ord` trait get a default CRDT implemetnation that keeps the maximum value.
/// This enables us to use LWW directly with primitive data types such as numbers or strings. It is
/// generally desirable in this case to never explicitly produce LWW values with the same timestamp
/// but different inner values, as the rule to keep the maximum value isn't generally the desired
/// semantics.)
///
/// As multiple computers clocks are always desynchronized,
/// when operations are close enough, it is equivalent to
/// take one copy and drop the other one.
///
/// Given that clocks are not too desynchronized, this assumption
/// is enough for most cases, as there is few chance that two humans
/// coordonate themself faster than the time difference between two NTP servers.
///
/// As a more concret example, let's suppose you want to upload a file
/// with the same key (path) in the same bucket at the very same time.
/// For each request, the file will be timestamped by the receiving server
/// and may differ from what you observed with your atomic clock!
///
/// This scheme is used by AWS S3 or Soundcloud and often without knowing
/// in entreprise when reconciliating databases with ad-hoc scripts.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWW<T> {
ts: u64,
v: T,
}
impl<T> LWW<T>
where
T: CRDT,
{
/// Creates a new CRDT
///
/// CRDT's internal timestamp is set with current node's clock.
pub fn new(value: T) -> Self {
Self {
ts: now_msec(),
v: value,
}
}
/// Build a new CRDT from a previous non-compatible one
///
/// Compared to new, the CRDT's timestamp is not set to now
/// but must be set to the previous, non-compatible, CRDT's timestamp.
pub fn migrate_from_raw(ts: u64, value: T) -> Self {
Self { ts, v: value }
}
/// Update the LWW CRDT while keeping some causal ordering.
///
/// The timestamp of the LWW CRDT is updated to be the current node's clock
/// at time of update, or the previous timestamp + 1 if that's bigger,
/// so that the new timestamp is always strictly larger than the previous one.
/// This ensures that merging the update with the old value will result in keeping
/// the updated value.
pub fn update(&mut self, new_value: T) {
self.ts = std::cmp::max(self.ts + 1, now_msec());
self.v = new_value;
}
/// Get the CRDT value
pub fn get(&self) -> &T {
&self.v
}
/// Get a mutable reference to the CRDT's value
///
/// This is usefull to mutate the inside value without changing the LWW timestamp.
/// When such mutation is done, the merge between two LWW values is done using the inner
/// CRDT's merge operation. This is usefull in the case where the inner CRDT is a large
/// data type, such as a map, and we only want to change a single item in the map.
/// To do this, we can produce a "CRDT delta", i.e. a LWW that contains only the modification.
/// This delta consists in a LWW with the same timestamp, and the map
/// inside only contains the updated value.
/// The advantage of such a delta is that it is much smaller than the whole map.
///
/// Avoid using this if the inner data type is a primitive type such as a number or a string,
/// as you will then rely on the merge function defined on `Ord` types by keeping the maximum
/// of both values.
pub fn get_mut(&mut self) -> &mut T {
&mut self.v
}
}
impl<T> CRDT for LWW<T>
where
T: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
if other.ts > self.ts {
self.ts = other.ts;
self.v = other.v.clone();
} else if other.ts == self.ts {
self.v.merge(&other.v);
}
}
}

145
src/table/crdt/lww_map.rs Normal file
View file

@ -0,0 +1,145 @@
use serde::{Deserialize, Serialize};
use garage_util::time::now_msec;
use crate::crdt::crdt::*;
/// Last Write Win Map
///
/// This types defines a CRDT for a map from keys to values.
/// The values have an associated timestamp, such that the last written value
/// takes precedence over previous ones. As for the simpler `LWW` type, the value
/// type `V` is also required to implement the CRDT trait.
/// We do not encourage mutating the values associated with a given key
/// without updating the timestamp, in fact at the moment we do not provide a `.get_mut()`
/// method that would allow that.
///
/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
/// such that two values can be compared for equality based on their hashes). As a consequence,
/// insertions take `O(n)` time. This means that LWWMap should be used for reasonably small maps.
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct LWWMap<K, V> {
vals: Vec<(K, u64, V)>,
}
impl<K, V> LWWMap<K, V>
where
K: Ord,
V: CRDT,
{
/// Create a new empty map CRDT
pub fn new() -> Self {
Self { vals: vec![] }
}
/// Used to migrate from a map defined in an incompatible format. This produces
/// a map that contains a single item with the specified timestamp (copied from
/// the incompatible format). Do this as many times as you have items to migrate,
/// and put them all together using the CRDT merge operator.
pub fn migrate_from_raw_item(k: K, ts: u64, v: V) -> Self {
Self {
vals: vec![(k, ts, v)],
}
}
/// Returns a map that contains a single mapping from the specified key to the specified value.
/// This map is a mutator, or a delta-CRDT, such that when it is merged with the original map,
/// the previous value will be replaced with the one specified here.
/// The timestamp in the provided mutator is set to the maximum of the current system's clock
/// and 1 + the previous value's timestamp (if there is one), so that the new value will always
/// take precedence (LWW rule).
///
/// Typically, to update the value associated to a key in the map, you would do the following:
///
/// ```ignore
/// let my_update = my_crdt.update_mutator(key_to_modify, new_value);
/// my_crdt.merge(&my_update);
/// ```
///
/// However extracting the mutator on its own and only sending that on the network is very
/// interesting as it is much smaller than the whole map.
pub fn update_mutator(&self, k: K, new_v: V) -> Self {
let new_vals = match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, old_ts, _) = self.vals[i];
let new_ts = std::cmp::max(old_ts + 1, now_msec());
vec![(k, new_ts, new_v)]
}
Err(_) => vec![(k, now_msec(), new_v)],
};
Self { vals: new_vals }
}
/// Takes all of the values of the map and returns them. The current map is reset to the
/// empty map. This is very usefull to produce in-place a new map that contains only a delta
/// that modifies a certain value:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a = a.take_and_clear();
/// a.merge(&old_a.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
///
/// Of course in this simple example we could have written simply
/// `pyt_my_crdt_value(a.update_mutator(key_to_modify, new_value))`,
/// but in the case where the map is a field in a struct for instance (as is always the case),
/// this becomes very handy:
///
/// ```ignore
/// let mut a = get_my_crdt_value();
/// let old_a_map = a.map_field.take_and_clear();
/// a.map_field.merge(&old_a_map.update_mutator(key_to_modify, new_value));
/// put_my_crdt_value(a);
/// ```
pub fn take_and_clear(&mut self) -> Self {
let vals = std::mem::replace(&mut self.vals, vec![]);
Self { vals }
}
/// Removes all values from the map
pub fn clear(&mut self) {
self.vals.clear();
}
/// Get a reference to the value assigned to a key
pub fn get(&self, k: &K) -> Option<&V> {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => Some(&self.vals[i].2),
Err(_) => None,
}
}
/// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
/// In most case you will want to ignore the timestamp (second item of the tuple).
pub fn items(&self) -> &[(K, u64, V)] {
&self.vals[..]
}
/// Returns the number of items in the map
pub fn len(&self) -> usize {
self.vals.len()
}
}
impl<K, V> CRDT for LWWMap<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
for (k, ts2, v2) in other.vals.iter() {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
Ok(i) => {
let (_, ts1, _v1) = &self.vals[i];
if ts2 > ts1 {
self.vals[i].1 = *ts2;
self.vals[i].2 = v2.clone();
} else if ts1 == ts2 {
self.vals[i].2.merge(&v2);
}
}
Err(i) => {
self.vals.insert(i, (k.clone(), *ts2, v2.clone()));
}
}
}
}
}

83
src/table/crdt/map.rs Normal file
View file

@ -0,0 +1,83 @@
use serde::{Deserialize, Serialize};
use crate::crdt::crdt::*;
/// Simple CRDT Map
///
/// This types defines a CRDT for a map from keys to values. Values are CRDT types which
/// can have their own updating logic.
///
/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
/// such that two values can be compared for equality based on their hashes). As a consequence,
/// insertions take `O(n)` time. This means that Map should be used for reasonably small maps.
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct Map<K, V> {
vals: Vec<(K, V)>,
}
impl<K, V> Map<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
/// Create a new empty map CRDT
pub fn new() -> Self {
Self { vals: vec![] }
}
/// Returns a map that contains a single mapping from the specified key to the specified value.
/// This can be used to build a delta-mutator:
/// when merged with another map, the value will be added or CRDT-merged if a previous
/// value already exists.
pub fn put_mutator(k: K, v: V) -> Self {
Self { vals: vec![(k, v)] }
}
pub fn put(&mut self, k: K, v: V) {
self.merge(&Self::put_mutator(k, v));
}
/// Removes all values from the map
pub fn clear(&mut self) {
self.vals.clear();
}
/// Get a reference to the value assigned to a key
pub fn get(&self, k: &K) -> Option<&V> {
match self.vals.binary_search_by(|(k2, _)| k2.cmp(&k)) {
Ok(i) => Some(&self.vals[i].1),
Err(_) => None,
}
}
/// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
pub fn items(&self) -> &[(K, V)] {
&self.vals[..]
}
/// Returns the number of items in the map
pub fn len(&self) -> usize {
self.vals.len()
}
}
impl<K, V> CRDT for Map<K, V>
where
K: Clone + Ord,
V: Clone + CRDT,
{
fn merge(&mut self, other: &Self) {
for (k, v2) in other.vals.iter() {
match self.vals.binary_search_by(|(k2, _)| k2.cmp(&k)) {
Ok(i) => {
self.vals[i].1.merge(&v2);
}
Err(i) => {
self.vals.insert(i, (k.clone(), v2.clone()));
}
}
}
}
}

22
src/table/crdt/mod.rs Normal file
View file

@ -0,0 +1,22 @@
//! This package provides a simple implementation of conflict-free replicated data types (CRDTs)
//!
//! CRDTs are a type of data structures that do not require coordination. In other words, we can
//! edit them in parallel, we will always find a way to merge it.
//!
//! A general example is a counter. Its initial value is 0. Alice and Bob get a copy of the
//! counter. Alice does +1 on her copy, she reads 1. Bob does +3 on his copy, he reads 3. Now,
//! it is easy to merge their counters, order does not count: we always get 4.
//!
//! Learn more about CRDT [on Wikipedia](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type)
mod bool;
mod crdt;
mod lww;
mod lww_map;
mod map;
pub use self::bool::*;
pub use crdt::*;
pub use lww::*;
pub use lww_map::*;
pub use map::*;

240
src/table/data.rs Normal file
View file

@ -0,0 +1,240 @@
use core::borrow::Borrow;
use std::sync::Arc;
use log::warn;
use serde_bytes::ByteBuf;
use sled::Transactional;
use tokio::sync::Notify;
use garage_util::data::*;
use garage_util::error::*;
use crate::crdt::CRDT;
use crate::replication::*;
use crate::schema::*;
pub struct TableData<F: TableSchema, R: TableReplication> {
pub name: String,
pub(crate) instance: F,
pub(crate) replication: R,
pub store: sled::Tree,
pub(crate) merkle_tree: sled::Tree,
pub(crate) merkle_todo: sled::Tree,
pub(crate) merkle_todo_notify: Notify,
pub(crate) gc_todo: sled::Tree,
}
impl<F, R> TableData<F, R>
where
F: TableSchema,
R: TableReplication,
{
pub fn new(name: String, instance: F, replication: R, db: &sled::Db) -> Arc<Self> {
let store = db
.open_tree(&format!("{}:table", name))
.expect("Unable to open DB tree");
let merkle_tree = db
.open_tree(&format!("{}:merkle_tree", name))
.expect("Unable to open DB Merkle tree tree");
let merkle_todo = db
.open_tree(&format!("{}:merkle_todo", name))
.expect("Unable to open DB Merkle TODO tree");
let gc_todo = db
.open_tree(&format!("{}:gc_todo", name))
.expect("Unable to open DB tree");
Arc::new(Self {
name,
instance,
replication,
store,
merkle_tree,
merkle_todo,
merkle_todo_notify: Notify::new(),
gc_todo,
})
}
// Read functions
pub fn read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
let tree_key = self.tree_key(p, s);
if let Some(bytes) = self.store.get(&tree_key)? {
Ok(Some(ByteBuf::from(bytes.to_vec())))
} else {
Ok(None)
}
}
pub fn read_range(
&self,
p: &F::P,
s: &Option<F::S>,
filter: &Option<F::Filter>,
limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
let partition_hash = p.hash();
let first_key = match s {
None => partition_hash.to_vec(),
Some(sk) => self.tree_key(p, sk),
};
let mut ret = vec![];
for item in self.store.range(first_key..) {
let (key, value) = item?;
if &key[..32] != partition_hash.as_slice() {
break;
}
let keep = match filter {
None => true,
Some(f) => {
let entry = self.decode_entry(value.as_ref())?;
F::matches_filter(&entry, f)
}
};
if keep {
ret.push(Arc::new(ByteBuf::from(value.as_ref())));
}
if ret.len() >= limit {
break;
}
}
Ok(ret)
}
// Mutation functions
// When changing this code, take care of propagating modifications correctly:
// - When an entry is modified or deleted, call the updated() function
// on the table instance
// - When an entry is modified or deleted, add it to the merkle updater's todo list.
// This has to be done atomically with the modification for the merkle updater
// to maintain consistency. The merkle updater must then be notified with todo_notify.
// - When an entry is updated to be a tombstone, add it to the gc_todo tree
pub(crate) fn update_many<T: Borrow<ByteBuf>>(&self, entries: &[T]) -> Result<(), Error> {
for update_bytes in entries.iter() {
self.update_entry(update_bytes.borrow().as_slice())?;
}
Ok(())
}
pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
let update = self.decode_entry(update_bytes)?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
let changed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
let (old_entry, new_entry) = match store.get(&tree_key)? {
Some(prev_bytes) => {
let old_entry = self
.decode_entry(&prev_bytes)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
let mut new_entry = old_entry.clone();
new_entry.merge(&update);
(Some(old_entry), new_entry)
}
None => (None, update.clone()),
};
if Some(&new_entry) != old_entry.as_ref() {
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RMPEncode)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
let new_bytes_hash = blake2sum(&new_bytes[..]);
mkl_todo.insert(tree_key.clone(), new_bytes_hash.as_slice())?;
store.insert(tree_key.clone(), new_bytes)?;
Ok(Some((old_entry, new_entry, new_bytes_hash)))
} else {
Ok(None)
}
})?;
if let Some((old_entry, new_entry, new_bytes_hash)) = changed {
let is_tombstone = new_entry.is_tombstone();
self.instance.updated(old_entry, Some(new_entry));
self.merkle_todo_notify.notify_one();
if is_tombstone {
self.gc_todo.insert(&tree_key, new_bytes_hash.as_slice())?;
}
}
Ok(())
}
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
if let Some(cur_v) = store.get(k)? {
if cur_v == v {
store.remove(k)?;
mkl_todo.insert(k, vec![])?;
return Ok(true);
}
}
Ok(false)
})?;
if removed {
let old_entry = self.decode_entry(v)?;
self.instance.updated(Some(old_entry), None);
self.merkle_todo_notify.notify_one();
}
Ok(removed)
}
pub(crate) fn delete_if_equal_hash(
self: &Arc<Self>,
k: &[u8],
vhash: Hash,
) -> Result<bool, Error> {
let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
if let Some(cur_v) = store.get(k)? {
if blake2sum(&cur_v[..]) == vhash {
store.remove(k)?;
mkl_todo.insert(k, vec![])?;
return Ok(Some(cur_v));
}
}
Ok(None)
})?;
if let Some(old_v) = removed {
let old_entry = self.decode_entry(&old_v[..])?;
self.instance.updated(Some(old_entry), None);
self.merkle_todo_notify.notify_one();
Ok(true)
} else {
Ok(false)
}
}
// ---- Utility functions ----
pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
let mut ret = p.hash().to_vec();
ret.extend(s.sort_key());
ret
}
pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
Ok(x) => Ok(x),
Err(e) => match F::try_migrate(bytes) {
Some(x) => Ok(x),
None => {
warn!("Unable to decode entry of {}: {}", self.name, e);
for line in hexdump::hexdump_iter(bytes) {
debug!("{}", line);
}
Err(e.into())
}
},
}
}
pub fn gc_todo_len(&self) -> usize {
self.gc_todo.len()
}
}

245
src/table/gc.rs Normal file
View file

@ -0,0 +1,245 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use futures::future::join_all;
use futures::select;
use futures_util::future::*;
use tokio::sync::watch;
use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::membership::System;
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
use crate::data::*;
use crate::replication::*;
use crate::schema::*;
const TABLE_GC_BATCH_SIZE: usize = 1024;
const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
pub struct TableGC<F: TableSchema, R: TableReplication> {
system: Arc<System>,
data: Arc<TableData<F, R>>,
rpc_client: Arc<RpcClient<GcRPC>>,
}
#[derive(Serialize, Deserialize)]
enum GcRPC {
Update(Vec<ByteBuf>),
DeleteIfEqualHash(Vec<(ByteBuf, Hash)>),
Ok,
}
impl RpcMessage for GcRPC {}
impl<F, R> TableGC<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) fn launch(
system: Arc<System>,
data: Arc<TableData<F, R>>,
rpc_server: &mut RpcServer,
) -> Arc<Self> {
let rpc_path = format!("table_{}/gc", data.name);
let rpc_client = system.rpc_client::<GcRPC>(&rpc_path);
let gc = Arc::new(Self {
system: system.clone(),
data: data.clone(),
rpc_client,
});
gc.register_handler(rpc_server, rpc_path);
let gc1 = gc.clone();
system.background.spawn_worker(
format!("GC loop for {}", data.name),
move |must_exit: watch::Receiver<bool>| gc1.gc_loop(must_exit),
);
gc
}
async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
while !*must_exit.borrow() {
match self.gc_loop_iter().await {
Ok(true) => {
// Stuff was done, loop imediately
continue;
}
Ok(false) => {
// Nothing was done, sleep for some time (below)
}
Err(e) => {
warn!("({}) Error doing GC: {}", self.data.name, e);
}
}
select! {
_ = tokio::time::sleep(Duration::from_secs(10)).fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
}
async fn gc_loop_iter(&self) -> Result<bool, Error> {
let mut entries = vec![];
let mut excluded = vec![];
for item in self.data.gc_todo.iter() {
let (k, vhash) = item?;
let vhash = Hash::try_from(&vhash[..]).unwrap();
let v_opt = self
.data
.store
.get(&k[..])?
.filter(|v| blake2sum(&v[..]) == vhash);
if let Some(v) = v_opt {
entries.push((ByteBuf::from(k.to_vec()), vhash, ByteBuf::from(v.to_vec())));
if entries.len() >= TABLE_GC_BATCH_SIZE {
break;
}
} else {
excluded.push((k, vhash));
}
}
for (k, vhash) in excluded {
self.todo_remove_if_equal(&k[..], vhash)?;
}
if entries.len() == 0 {
// Nothing to do in this iteration
return Ok(false);
}
debug!("({}) GC: doing {} items", self.data.name, entries.len());
let mut partitions = HashMap::new();
for (k, vhash, v) in entries {
let pkh = Hash::try_from(&k[..32]).unwrap();
let mut nodes = self.data.replication.write_nodes(&pkh);
nodes.retain(|x| *x != self.system.id);
nodes.sort();
if !partitions.contains_key(&nodes) {
partitions.insert(nodes.clone(), vec![]);
}
partitions.get_mut(&nodes).unwrap().push((k, vhash, v));
}
let resps = join_all(
partitions
.into_iter()
.map(|(nodes, items)| self.try_send_and_delete(nodes, items)),
)
.await;
for resp in resps {
if let Err(e) = resp {
warn!(
"({}) Unable to send and delete for GC: {}",
self.data.name, e
);
}
}
Ok(true)
}
async fn try_send_and_delete(
&self,
nodes: Vec<UUID>,
items: Vec<(ByteBuf, Hash, ByteBuf)>,
) -> Result<(), Error> {
let n_items = items.len();
let mut updates = vec![];
let mut deletes = vec![];
for (k, vhash, v) in items {
updates.push(v);
deletes.push((k, vhash));
}
self.rpc_client
.try_call_many(
&nodes[..],
GcRPC::Update(updates),
RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_GC_RPC_TIMEOUT),
)
.await?;
info!(
"({}) GC: {} items successfully pushed, will try to delete.",
self.data.name, n_items
);
self.rpc_client
.try_call_many(
&nodes[..],
GcRPC::DeleteIfEqualHash(deletes.clone()),
RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_GC_RPC_TIMEOUT),
)
.await?;
for (k, vhash) in deletes {
self.data.delete_if_equal_hash(&k[..], vhash)?;
self.todo_remove_if_equal(&k[..], vhash)?;
}
Ok(())
}
fn todo_remove_if_equal(&self, key: &[u8], vhash: Hash) -> Result<(), Error> {
let _ = self
.data
.gc_todo
.compare_and_swap::<_, _, Vec<u8>>(key, Some(vhash), None)?;
Ok(())
}
// ---- RPC HANDLER ----
fn register_handler(self: &Arc<Self>, rpc_server: &mut RpcServer, path: String) {
let self2 = self.clone();
rpc_server.add_handler::<GcRPC, _, _>(path, move |msg, _addr| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
let self2 = self.clone();
self.rpc_client
.set_local_handler(self.system.id, move |msg| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
}
async fn handle_rpc(self: &Arc<Self>, message: &GcRPC) -> Result<GcRPC, Error> {
match message {
GcRPC::Update(items) => {
self.data.update_many(items)?;
Ok(GcRPC::Ok)
}
GcRPC::DeleteIfEqualHash(items) => {
for (key, vhash) in items.iter() {
self.data.delete_if_equal_hash(&key[..], *vhash)?;
self.todo_remove_if_equal(&key[..], *vhash)?;
}
Ok(GcRPC::Ok)
}
_ => Err(Error::Message(format!("Unexpected GC RPC"))),
}
}
}

View file

@ -7,10 +7,12 @@ pub mod crdt;
pub mod schema;
pub mod util;
pub mod data;
pub mod gc;
pub mod merkle;
pub mod replication;
pub mod sync;
pub mod table;
pub mod table_fullcopy;
pub mod table_sharded;
pub mod table_sync;
pub use schema::*;
pub use table::*;

439
src/table/merkle.rs Normal file
View file

@ -0,0 +1,439 @@
use std::sync::Arc;
use std::time::Duration;
use futures::select;
use futures_util::future::*;
use log::{debug, warn};
use serde::{Deserialize, Serialize};
use sled::transaction::{
ConflictableTransactionError, ConflictableTransactionResult, TransactionalTree,
};
use tokio::sync::watch;
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::ring::*;
use crate::data::*;
use crate::replication::*;
use crate::schema::*;
// This modules partitions the data in 2**16 partitions, based on the top
// 16 bits (two bytes) of item's partition keys' hashes.
// It builds one Merkle tree for each of these 2**16 partitions.
pub struct MerkleUpdater<F: TableSchema, R: TableReplication> {
data: Arc<TableData<F, R>>,
// Content of the todo tree: items where
// - key = the key of an item in the main table, ie hash(partition_key)+sort_key
// - value = the hash of the full serialized item, if present,
// or an empty vec if item is absent (deleted)
// Fields in data:
// pub(crate) merkle_todo: sled::Tree,
// pub(crate) merkle_todo_notify: Notify,
// Content of the merkle tree: items where
// - key = .bytes() for MerkleNodeKey
// - value = serialization of a MerkleNode, assumed to be MerkleNode::empty if not found
// Field in data:
// pub(crate) merkle_tree: sled::Tree,
empty_node_hash: Hash,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MerkleNodeKey {
// partition number
pub partition: Partition,
// prefix: a prefix for the hash of full keys, i.e. hash(hash(partition_key)+sort_key)
#[serde(with = "serde_bytes")]
pub prefix: Vec<u8>,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum MerkleNode {
// The empty Merkle node
Empty,
// An intermediate Merkle tree node for a prefix
// Contains the hashes of the 256 possible next prefixes
Intermediate(Vec<(u8, Hash)>),
// A final node for an item
// Contains the full key of the item and the hash of the value
Leaf(Vec<u8>, Hash),
}
impl<F, R> MerkleUpdater<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) fn launch(background: &BackgroundRunner, data: Arc<TableData<F, R>>) -> Arc<Self> {
let empty_node_hash = blake2sum(&rmp_to_vec_all_named(&MerkleNode::Empty).unwrap()[..]);
let ret = Arc::new(Self {
data,
empty_node_hash,
});
let ret2 = ret.clone();
background.spawn_worker(
format!("Merkle tree updater for {}", ret.data.name),
|must_exit: watch::Receiver<bool>| ret2.updater_loop(must_exit),
);
ret
}
async fn updater_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
while !*must_exit.borrow() {
if let Some(x) = self.data.merkle_todo.iter().next() {
match x {
Ok((key, valhash)) => {
if let Err(e) = self.update_item(&key[..], &valhash[..]) {
warn!(
"({}) Error while updating Merkle tree item: {}",
self.data.name, e
);
}
}
Err(e) => {
warn!(
"({}) Error while iterating on Merkle todo tree: {}",
self.data.name, e
);
tokio::time::sleep(Duration::from_secs(10)).await;
}
}
} else {
select! {
_ = self.data.merkle_todo_notify.notified().fuse() => (),
_ = must_exit.changed().fuse() => (),
}
}
}
}
fn update_item(&self, k: &[u8], vhash_by: &[u8]) -> Result<(), Error> {
let khash = blake2sum(k);
let new_vhash = if vhash_by.len() == 0 {
None
} else {
Some(Hash::try_from(&vhash_by[..]).unwrap())
};
let key = MerkleNodeKey {
partition: self
.data
.replication
.partition_of(&Hash::try_from(&k[0..32]).unwrap()),
prefix: vec![],
};
self.data
.merkle_tree
.transaction(|tx| self.update_item_rec(tx, k, khash, &key, new_vhash))?;
let deleted = self
.data
.merkle_todo
.compare_and_swap::<_, _, Vec<u8>>(k, Some(vhash_by), None)?
.is_ok();
if !deleted {
debug!(
"({}) Item not deleted from Merkle todo because it changed: {:?}",
self.data.name, k
);
}
Ok(())
}
fn update_item_rec(
&self,
tx: &TransactionalTree,
k: &[u8],
khash: Hash,
key: &MerkleNodeKey,
new_vhash: Option<Hash>,
) -> ConflictableTransactionResult<Option<Hash>, Error> {
let i = key.prefix.len();
// Read node at current position (defined by the prefix stored in key)
// Calculate an update to apply to this node
// This update is an Option<_>, so that it is None if the update is a no-op
// and we can thus skip recalculating and re-storing everything
let mutate = match self.read_node_txn(tx, &key)? {
MerkleNode::Empty => {
if let Some(vhv) = new_vhash {
Some(MerkleNode::Leaf(k.to_vec(), vhv))
} else {
// Nothing to do, keep empty node
None
}
}
MerkleNode::Intermediate(mut children) => {
let key2 = key.next_key(khash);
if let Some(subhash) = self.update_item_rec(tx, k, khash, &key2, new_vhash)? {
// Subtree changed, update this node as well
if subhash == self.empty_node_hash {
intermediate_rm_child(&mut children, key2.prefix[i]);
} else {
intermediate_set_child(&mut children, key2.prefix[i], subhash);
}
if children.len() == 0 {
// should not happen
warn!(
"({}) Replacing intermediate node with empty node, should not happen.",
self.data.name
);
Some(MerkleNode::Empty)
} else if children.len() == 1 {
// We now have a single node (case when the update deleted one of only two
// children). Move that single child to this level of the tree.
let key_sub = key.add_byte(children[0].0);
let subnode = self.read_node_txn(tx, &key_sub)?;
tx.remove(key_sub.encode())?;
Some(subnode)
} else {
Some(MerkleNode::Intermediate(children))
}
} else {
// Subtree not changed, nothing to do
None
}
}
MerkleNode::Leaf(exlf_key, exlf_hash) => {
if exlf_key == k {
// This leaf is for the same key that the one we are updating
match new_vhash {
Some(vhv) if vhv == exlf_hash => None,
Some(vhv) => Some(MerkleNode::Leaf(k.to_vec(), vhv)),
None => Some(MerkleNode::Empty),
}
} else {
// This is an only leaf for another key
if let Some(vhv) = new_vhash {
// Move that other key to a subnode, create another subnode for our
// insertion and replace current node by an intermediary node
let (pos1, h1) = {
let key2 = key.next_key(blake2sum(&exlf_key[..]));
let subhash = self.put_node_txn(
tx,
&key2,
&MerkleNode::Leaf(exlf_key, exlf_hash),
)?;
(key2.prefix[i], subhash)
};
let (pos2, h2) = {
let key2 = key.next_key(khash);
let subhash =
self.put_node_txn(tx, &key2, &MerkleNode::Leaf(k.to_vec(), vhv))?;
(key2.prefix[i], subhash)
};
let mut int = vec![];
intermediate_set_child(&mut int, pos1, h1);
intermediate_set_child(&mut int, pos2, h2);
Some(MerkleNode::Intermediate(int))
} else {
// Nothing to do, we don't want to insert this value because it is None,
// and we don't want to change the other value because it's for something
// else
None
}
}
}
};
if let Some(new_node) = mutate {
let hash = self.put_node_txn(tx, &key, &new_node)?;
Ok(Some(hash))
} else {
Ok(None)
}
}
// Merkle tree node manipulation
fn read_node_txn(
&self,
tx: &TransactionalTree,
k: &MerkleNodeKey,
) -> ConflictableTransactionResult<MerkleNode, Error> {
let ent = tx.get(k.encode())?;
match ent {
None => Ok(MerkleNode::Empty),
Some(v) => Ok(rmp_serde::decode::from_read_ref::<_, MerkleNode>(&v[..])
.map_err(|e| ConflictableTransactionError::Abort(e.into()))?),
}
}
fn put_node_txn(
&self,
tx: &TransactionalTree,
k: &MerkleNodeKey,
v: &MerkleNode,
) -> ConflictableTransactionResult<Hash, Error> {
trace!("Put Merkle node: {:?} => {:?}", k, v);
if *v == MerkleNode::Empty {
tx.remove(k.encode())?;
Ok(self.empty_node_hash)
} else {
let vby = rmp_to_vec_all_named(v)
.map_err(|e| ConflictableTransactionError::Abort(e.into()))?;
let rethash = blake2sum(&vby[..]);
tx.insert(k.encode(), vby)?;
Ok(rethash)
}
}
// Access a node in the Merkle tree, used by the sync protocol
pub(crate) fn read_node(&self, k: &MerkleNodeKey) -> Result<MerkleNode, Error> {
let ent = self.data.merkle_tree.get(k.encode())?;
match ent {
None => Ok(MerkleNode::Empty),
Some(v) => Ok(rmp_serde::decode::from_read_ref::<_, MerkleNode>(&v[..])?),
}
}
pub fn merkle_tree_len(&self) -> usize {
self.data.merkle_tree.len()
}
pub fn todo_len(&self) -> usize {
self.data.merkle_todo.len()
}
}
impl MerkleNodeKey {
fn encode(&self) -> Vec<u8> {
let mut ret = Vec::with_capacity(2 + self.prefix.len());
ret.extend(&u16::to_be_bytes(self.partition)[..]);
ret.extend(&self.prefix[..]);
ret
}
pub fn next_key(&self, h: Hash) -> Self {
assert!(&h.as_slice()[0..self.prefix.len()] == &self.prefix[..]);
let mut s2 = self.clone();
s2.prefix.push(h.as_slice()[self.prefix.len()]);
s2
}
pub fn add_byte(&self, b: u8) -> Self {
let mut s2 = self.clone();
s2.prefix.push(b);
s2
}
}
fn intermediate_set_child(ch: &mut Vec<(u8, Hash)>, pos: u8, v: Hash) {
for i in 0..ch.len() {
if ch[i].0 == pos {
ch[i].1 = v;
return;
} else if ch[i].0 > pos {
ch.insert(i, (pos, v));
return;
}
}
ch.insert(ch.len(), (pos, v));
}
fn intermediate_rm_child(ch: &mut Vec<(u8, Hash)>, pos: u8) {
for i in 0..ch.len() {
if ch[i].0 == pos {
ch.remove(i);
return;
}
}
}
#[test]
fn test_intermediate_aux() {
let mut v = vec![];
intermediate_set_child(&mut v, 12u8, [12u8; 32].into());
assert_eq!(v, vec![(12u8, [12u8; 32].into())]);
intermediate_set_child(&mut v, 42u8, [42u8; 32].into());
assert_eq!(
v,
vec![(12u8, [12u8; 32].into()), (42u8, [42u8; 32].into())]
);
intermediate_set_child(&mut v, 4u8, [4u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(12u8, [12u8; 32].into()),
(42u8, [42u8; 32].into())
]
);
intermediate_set_child(&mut v, 12u8, [8u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(12u8, [8u8; 32].into()),
(42u8, [42u8; 32].into())
]
);
intermediate_set_child(&mut v, 6u8, [6u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [6u8; 32].into()),
(12u8, [8u8; 32].into()),
(42u8, [42u8; 32].into())
]
);
intermediate_rm_child(&mut v, 42u8);
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [6u8; 32].into()),
(12u8, [8u8; 32].into())
]
);
intermediate_rm_child(&mut v, 11u8);
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [6u8; 32].into()),
(12u8, [8u8; 32].into())
]
);
intermediate_rm_child(&mut v, 6u8);
assert_eq!(v, vec![(4u8, [4u8; 32].into()), (12u8, [8u8; 32].into())]);
intermediate_set_child(&mut v, 6u8, [7u8; 32].into());
assert_eq!(
v,
vec![
(4u8, [4u8; 32].into()),
(6u8, [7u8; 32].into()),
(12u8, [8u8; 32].into())
]
);
}
impl MerkleNode {
pub fn is_empty(&self) -> bool {
*self == MerkleNode::Empty
}
}

View file

@ -0,0 +1,51 @@
use std::sync::Arc;
use garage_rpc::membership::System;
use garage_rpc::ring::*;
use garage_util::data::*;
use crate::replication::*;
#[derive(Clone)]
pub struct TableFullReplication {
pub system: Arc<System>,
pub max_faults: usize,
}
impl TableReplication for TableFullReplication {
// Full replication schema: all nodes store everything
// Writes are disseminated in an epidemic manner in the network
// Advantage: do all reads locally, extremely fast
// Inconvenient: only suitable to reasonably small tables
fn read_nodes(&self, _hash: &Hash) -> Vec<UUID> {
vec![self.system.id]
}
fn read_quorum(&self) -> usize {
1
}
fn write_nodes(&self, _hash: &Hash) -> Vec<UUID> {
let ring = self.system.ring.borrow();
ring.config.members.keys().cloned().collect::<Vec<_>>()
}
fn write_quorum(&self) -> usize {
let nmembers = self.system.ring.borrow().config.members.len();
if nmembers > self.max_faults {
nmembers - self.max_faults
} else {
1
}
}
fn max_write_errors(&self) -> usize {
self.max_faults
}
fn partition_of(&self, _hash: &Hash) -> Partition {
0u16
}
fn partitions(&self) -> Vec<(Partition, Hash)> {
vec![(0u16, [0u8; 32].into())]
}
}

View file

@ -0,0 +1,6 @@
mod parameters;
pub mod fullcopy;
pub mod sharded;
pub use parameters::*;

View file

@ -0,0 +1,21 @@
use garage_rpc::ring::*;
use garage_util::data::*;
pub trait TableReplication: Send + Sync {
// See examples in table_sharded.rs and table_fullcopy.rs
// To understand various replication methods
// Which nodes to send reads from
fn read_nodes(&self, hash: &Hash) -> Vec<UUID>;
fn read_quorum(&self) -> usize;
// Which nodes to send writes to
fn write_nodes(&self, hash: &Hash) -> Vec<UUID>;
fn write_quorum(&self) -> usize;
fn max_write_errors(&self) -> usize;
// Accessing partitions, for Merkle tree & sync
fn partition_of(&self, hash: &Hash) -> Partition;
fn partitions(&self) -> Vec<(Partition, Hash)>;
}

View file

@ -1,11 +1,14 @@
use std::sync::Arc;
use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
use garage_rpc::ring::*;
use garage_util::data::*;
use crate::*;
use crate::replication::*;
#[derive(Clone)]
pub struct TableShardedReplication {
pub system: Arc<System>,
pub replication_factor: usize,
pub read_quorum: usize,
pub write_quorum: usize,
@ -19,35 +22,29 @@ impl TableReplication for TableShardedReplication {
// - reads are done on all of the nodes that replicate the data
// - writes as well
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> {
let ring = system.ring.borrow().clone();
fn read_nodes(&self, hash: &Hash) -> Vec<UUID> {
let ring = self.system.ring.borrow().clone();
ring.walk_ring(&hash, self.replication_factor)
}
fn read_quorum(&self) -> usize {
self.read_quorum
}
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> {
let ring = system.ring.borrow().clone();
fn write_nodes(&self, hash: &Hash) -> Vec<UUID> {
let ring = self.system.ring.borrow();
ring.walk_ring(&hash, self.replication_factor)
}
fn write_quorum(&self, _system: &System) -> usize {
fn write_quorum(&self) -> usize {
self.write_quorum
}
fn max_write_errors(&self) -> usize {
self.replication_factor - self.write_quorum
}
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID> {
ring.walk_ring(&hash, self.replication_factor)
fn partition_of(&self, hash: &Hash) -> Partition {
self.system.ring.borrow().partition_of(hash)
}
fn split_points(&self, ring: &Ring) -> Vec<Hash> {
let mut ret = vec![];
for entry in ring.ring.iter() {
ret.push(entry.location);
}
ret.push([0xFFu8; 32].into());
ret
fn partitions(&self) -> Vec<(Partition, Hash)> {
self.system.ring.borrow().partitions()
}
}

View file

@ -2,13 +2,15 @@ use serde::{Deserialize, Serialize};
use garage_util::data::*;
use crate::crdt::CRDT;
pub trait PartitionKey {
fn hash(&self) -> Hash;
}
impl PartitionKey for String {
fn hash(&self) -> Hash {
sha256sum(self.as_bytes())
blake2sum(self.as_bytes())
}
}
@ -35,12 +37,14 @@ impl SortKey for Hash {
}
pub trait Entry<P: PartitionKey, S: SortKey>:
PartialEq + Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync
CRDT + PartialEq + Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync
{
fn partition_key(&self) -> &P;
fn sort_key(&self) -> &S;
fn merge(&mut self, other: &Self);
fn is_tombstone(&self) -> bool {
false
}
}
pub trait TableSchema: Send + Sync {

614
src/table/sync.rs Normal file
View file

@ -0,0 +1,614 @@
use std::collections::VecDeque;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use futures::select;
use futures_util::future::*;
use futures_util::stream::*;
use rand::Rng;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use tokio::sync::{mpsc, watch};
use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::membership::System;
use garage_rpc::ring::*;
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
use crate::data::*;
use crate::merkle::*;
use crate::replication::*;
use crate::*;
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
// Do anti-entropy every 10 minutes
const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
pub struct TableSyncer<F: TableSchema, R: TableReplication> {
system: Arc<System>,
data: Arc<TableData<F, R>>,
merkle: Arc<MerkleUpdater<F, R>>,
todo: Mutex<SyncTodo>,
rpc_client: Arc<RpcClient<SyncRPC>>,
}
#[derive(Serialize, Deserialize)]
pub(crate) enum SyncRPC {
RootCkHash(Partition, Hash),
RootCkDifferent(bool),
GetNode(MerkleNodeKey),
Node(MerkleNodeKey, MerkleNode),
Items(Vec<Arc<ByteBuf>>),
Ok,
}
impl RpcMessage for SyncRPC {}
struct SyncTodo {
todo: Vec<TodoPartition>,
}
#[derive(Debug, Clone)]
struct TodoPartition {
partition: Partition,
begin: Hash,
end: Hash,
// Are we a node that stores this partition or not?
retain: bool,
}
impl<F, R> TableSyncer<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) fn launch(
system: Arc<System>,
data: Arc<TableData<F, R>>,
merkle: Arc<MerkleUpdater<F, R>>,
rpc_server: &mut RpcServer,
) -> Arc<Self> {
let rpc_path = format!("table_{}/sync", data.name);
let rpc_client = system.rpc_client::<SyncRPC>(&rpc_path);
let todo = SyncTodo { todo: vec![] };
let syncer = Arc::new(Self {
system: system.clone(),
data: data.clone(),
merkle,
todo: Mutex::new(todo),
rpc_client,
});
syncer.register_handler(rpc_server, rpc_path);
let (busy_tx, busy_rx) = mpsc::unbounded_channel();
let s1 = syncer.clone();
system.background.spawn_worker(
format!("table sync watcher for {}", data.name),
move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
);
let s2 = syncer.clone();
system.background.spawn_worker(
format!("table syncer for {}", data.name),
move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
);
let s3 = syncer.clone();
tokio::spawn(async move {
tokio::time::sleep(Duration::from_secs(20)).await;
s3.add_full_sync();
});
syncer
}
fn register_handler(self: &Arc<Self>, rpc_server: &mut RpcServer, path: String) {
let self2 = self.clone();
rpc_server.add_handler::<SyncRPC, _, _>(path, move |msg, _addr| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
let self2 = self.clone();
self.rpc_client
.set_local_handler(self.system.id, move |msg| {
let self2 = self2.clone();
async move { self2.handle_rpc(&msg).await }
});
}
async fn watcher_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
mut busy_rx: mpsc::UnboundedReceiver<bool>,
) {
let mut prev_ring: Arc<Ring> = self.system.ring.borrow().clone();
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.system.ring.clone();
let mut nothing_to_do_since = Some(Instant::now());
while !*must_exit.borrow() {
select! {
_ = ring_recv.changed().fuse() => {
let new_ring = ring_recv.borrow();
if !Arc::ptr_eq(&new_ring, &prev_ring) {
debug!("({}) Ring changed, adding full sync to syncer todo list", self.data.name);
self.add_full_sync();
prev_ring = new_ring.clone();
}
}
busy_opt = busy_rx.recv().fuse() => {
if let Some(busy) = busy_opt {
if busy {
nothing_to_do_since = None;
} else {
if nothing_to_do_since.is_none() {
nothing_to_do_since = Some(Instant::now());
}
}
}
}
_ = must_exit.changed().fuse() => (),
_ = tokio::time::sleep(Duration::from_secs(1)).fuse() => {
if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) {
nothing_to_do_since = None;
debug!("({}) Interval passed, adding full sync to syncer todo list", self.data.name);
self.add_full_sync();
}
}
}
}
}
pub fn add_full_sync(&self) {
self.todo
.lock()
.unwrap()
.add_full_sync(&self.data, &self.system);
}
async fn syncer_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
busy_tx: mpsc::UnboundedSender<bool>,
) {
while !*must_exit.borrow() {
let task = self.todo.lock().unwrap().pop_task();
if let Some(partition) = task {
busy_tx.send(true).unwrap();
let res = self
.clone()
.sync_partition(&partition, &mut must_exit)
.await;
if let Err(e) = res {
warn!(
"({}) Error while syncing {:?}: {}",
self.data.name, partition, e
);
}
} else {
busy_tx.send(false).unwrap();
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
}
async fn sync_partition(
self: Arc<Self>,
partition: &TodoPartition,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
if partition.retain {
let my_id = self.system.id;
let nodes = self
.data
.replication
.write_nodes(&partition.begin)
.into_iter()
.filter(|node| *node != my_id)
.collect::<Vec<_>>();
debug!(
"({}) Syncing {:?} with {:?}...",
self.data.name, partition, nodes
);
let mut sync_futures = nodes
.iter()
.map(|node| {
self.clone()
.do_sync_with(partition.clone(), *node, must_exit.clone())
})
.collect::<FuturesUnordered<_>>();
let mut n_errors = 0;
while let Some(r) = sync_futures.next().await {
if let Err(e) = r {
n_errors += 1;
warn!("({}) Sync error: {}", self.data.name, e);
}
}
if n_errors > self.data.replication.max_write_errors() {
return Err(Error::Message(format!(
"Sync failed with too many nodes (should have been: {:?}).",
nodes
)));
}
} else {
self.offload_partition(&partition.begin, &partition.end, must_exit)
.await?;
}
Ok(())
}
// Offload partition: this partition is not something we are storing,
// so send it out to all other nodes that store it and delete items locally.
// We don't bother checking if the remote nodes already have the items,
// we just batch-send everything. Offloading isn't supposed to happen very often.
// If any of the nodes that are supposed to store the items is unable to
// save them, we interrupt the process.
async fn offload_partition(
self: &Arc<Self>,
begin: &Hash,
end: &Hash,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
let mut counter: usize = 0;
while !*must_exit.borrow() {
let mut items = Vec::new();
for item in self.data.store.range(begin.to_vec()..end.to_vec()) {
let (key, value) = item?;
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
if items.len() >= 1024 {
break;
}
}
if items.len() > 0 {
let nodes = self
.data
.replication
.write_nodes(&begin)
.into_iter()
.collect::<Vec<_>>();
if nodes.contains(&self.system.id) {
warn!(
"({}) Interrupting offload as partitions seem to have changed",
self.data.name
);
break;
}
if nodes.len() < self.data.replication.write_quorum() {
return Err(Error::Message(format!(
"Not offloading as we don't have a quorum of nodes to write to."
)));
}
counter += 1;
info!(
"({}) Offloading {} items from {:?}..{:?} ({})",
self.data.name,
items.len(),
begin,
end,
counter
);
self.offload_items(&items, &nodes[..]).await?;
} else {
break;
}
}
Ok(())
}
async fn offload_items(
self: &Arc<Self>,
items: &Vec<(Vec<u8>, Arc<ByteBuf>)>,
nodes: &[UUID],
) -> Result<(), Error> {
let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();
self.rpc_client
.try_call_many(
&nodes[..],
SyncRPC::Items(values),
RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_SYNC_RPC_TIMEOUT),
)
.await?;
// All remote nodes have written those items, now we can delete them locally
let mut not_removed = 0;
for (k, v) in items.iter() {
if !self.data.delete_if_equal(&k[..], &v[..])? {
not_removed += 1;
}
}
if not_removed > 0 {
debug!("({}) {} items not removed during offload because they changed in between (trying again...)", self.data.name, not_removed);
}
Ok(())
}
// ======= SYNCHRONIZATION PROCEDURE -- DRIVER SIDE ======
// The driver side is only concerned with sending out the item it has
// and the other side might not have. Receiving items that differ from one
// side to the other will happen when the other side syncs with us,
// which they also do regularly.
fn get_root_ck(&self, partition: Partition) -> Result<(MerkleNodeKey, MerkleNode), Error> {
let key = MerkleNodeKey {
partition,
prefix: vec![],
};
let node = self.merkle.read_node(&key)?;
Ok((key, node))
}
async fn do_sync_with(
self: Arc<Self>,
partition: TodoPartition,
who: UUID,
must_exit: watch::Receiver<bool>,
) -> Result<(), Error> {
let (root_ck_key, root_ck) = self.get_root_ck(partition.partition)?;
if root_ck.is_empty() {
debug!(
"({}) Sync {:?} with {:?}: partition is empty.",
self.data.name, partition, who
);
return Ok(());
}
let root_ck_hash = hash_of::<MerkleNode>(&root_ck)?;
// Check if they have the same root checksum
// If so, do nothing.
let root_resp = self
.rpc_client
.call(
who,
SyncRPC::RootCkHash(partition.partition, root_ck_hash),
TABLE_SYNC_RPC_TIMEOUT,
)
.await?;
let mut todo = match root_resp {
SyncRPC::RootCkDifferent(false) => {
debug!(
"({}) Sync {:?} with {:?}: no difference",
self.data.name, partition, who
);
return Ok(());
}
SyncRPC::RootCkDifferent(true) => VecDeque::from(vec![root_ck_key]),
x => {
return Err(Error::Message(format!(
"Invalid respone to RootCkHash RPC: {}",
debug_serialize(x)
)));
}
};
let mut todo_items = vec![];
while !todo.is_empty() && !*must_exit.borrow() {
let key = todo.pop_front().unwrap();
let node = self.merkle.read_node(&key)?;
match node {
MerkleNode::Empty => {
// They have items we don't have.
// We don't request those items from them, they will send them.
// We only bother with pushing items that differ
}
MerkleNode::Leaf(ik, ivhash) => {
// Just send that item directly
if let Some(val) = self.data.store.get(&ik[..])? {
if blake2sum(&val[..]) != ivhash {
warn!("({}) Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", self.data.name, ik);
}
todo_items.push(val.to_vec());
} else {
warn!("({}) Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", self.data.name, ik);
}
}
MerkleNode::Intermediate(l) => {
// Get Merkle node for this tree position at remote node
// and compare it with local node
let remote_node = match self
.rpc_client
.call(who, SyncRPC::GetNode(key.clone()), TABLE_SYNC_RPC_TIMEOUT)
.await?
{
SyncRPC::Node(_, node) => node,
x => {
return Err(Error::Message(format!(
"Invalid respone to GetNode RPC: {}",
debug_serialize(x)
)));
}
};
let int_l2 = match remote_node {
// If they have an intermediate node at this tree position,
// we can compare them to find differences
MerkleNode::Intermediate(l2) => l2,
// Otherwise, treat it as if they have nothing for this subtree,
// which will have the consequence of sending them everything
_ => vec![],
};
let join = join_ordered(&l[..], &int_l2[..]);
for (p, v1, v2) in join.into_iter() {
let diff = match (v1, v2) {
(Some(_), None) | (None, Some(_)) => true,
(Some(a), Some(b)) => a != b,
_ => false,
};
if diff {
todo.push_back(key.add_byte(*p));
}
}
}
}
if todo_items.len() >= 256 {
self.send_items(who, std::mem::replace(&mut todo_items, vec![]))
.await?;
}
}
if !todo_items.is_empty() {
self.send_items(who, todo_items).await?;
}
Ok(())
}
async fn send_items(&self, who: UUID, item_value_list: Vec<Vec<u8>>) -> Result<(), Error> {
info!(
"({}) Sending {} items to {:?}",
self.data.name,
item_value_list.len(),
who
);
let values = item_value_list
.into_iter()
.map(|x| Arc::new(ByteBuf::from(x)))
.collect::<Vec<_>>();
let rpc_resp = self
.rpc_client
.call(who, SyncRPC::Items(values), TABLE_SYNC_RPC_TIMEOUT)
.await?;
if let SyncRPC::Ok = rpc_resp {
Ok(())
} else {
Err(Error::Message(format!(
"Unexpected response to RPC Update: {}",
debug_serialize(&rpc_resp)
)))
}
}
// ======= SYNCHRONIZATION PROCEDURE -- RECEIVER SIDE ======
async fn handle_rpc(self: &Arc<Self>, message: &SyncRPC) -> Result<SyncRPC, Error> {
match message {
SyncRPC::RootCkHash(range, h) => {
let (_root_ck_key, root_ck) = self.get_root_ck(*range)?;
let hash = hash_of::<MerkleNode>(&root_ck)?;
Ok(SyncRPC::RootCkDifferent(hash != *h))
}
SyncRPC::GetNode(k) => {
let node = self.merkle.read_node(&k)?;
Ok(SyncRPC::Node(k.clone(), node))
}
SyncRPC::Items(items) => {
self.data.update_many(items)?;
Ok(SyncRPC::Ok)
}
_ => Err(Error::Message(format!("Unexpected sync RPC"))),
}
}
}
impl SyncTodo {
fn add_full_sync<F: TableSchema, R: TableReplication>(
&mut self,
data: &TableData<F, R>,
system: &System,
) {
let my_id = system.id;
self.todo.clear();
let partitions = data.replication.partitions();
for i in 0..partitions.len() {
let begin = partitions[i].1;
let end = if i + 1 < partitions.len() {
partitions[i + 1].1
} else {
[0xFFu8; 32].into()
};
let nodes = data.replication.write_nodes(&begin);
let retain = nodes.contains(&my_id);
if !retain {
// Check if we have some data to send, otherwise skip
if data.store.range(begin..end).next().is_none() {
continue;
}
}
self.todo.push(TodoPartition {
partition: partitions[i].0,
begin,
end,
retain,
});
}
}
fn pop_task(&mut self) -> Option<TodoPartition> {
if self.todo.is_empty() {
return None;
}
let i = rand::thread_rng().gen_range(0..self.todo.len());
if i == self.todo.len() - 1 {
self.todo.pop()
} else {
let replacement = self.todo.pop().unwrap();
let ret = std::mem::replace(&mut self.todo[i], replacement);
Some(ret)
}
}
}
fn hash_of<T: Serialize>(x: &T) -> Result<Hash, Error> {
Ok(blake2sum(&rmp_to_vec_all_named(x)?[..]))
}
fn join_ordered<'a, K: Ord + Eq, V1, V2>(
x: &'a [(K, V1)],
y: &'a [(K, V2)],
) -> Vec<(&'a K, Option<&'a V1>, Option<&'a V2>)> {
let mut ret = vec![];
let mut i = 0;
let mut j = 0;
while i < x.len() || j < y.len() {
if i < x.len() && j < y.len() && x[i].0 == y[j].0 {
ret.push((&x[i].0, Some(&x[i].1), Some(&y[j].1)));
i += 1;
j += 1;
} else if i < x.len() && (j == y.len() || x[i].0 < y[j].0) {
ret.push((&x[i].0, Some(&x[i].1), None));
i += 1;
} else if j < y.len() && (i == x.len() || x[i].0 > y[j].0) {
ret.push((&y[j].0, None, Some(&y[j].1)));
j += 1;
} else {
unreachable!();
}
}
ret
}

View file

@ -2,9 +2,6 @@ use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::Duration;
use log::warn;
use arc_swap::ArcSwapOption;
use futures::stream::*;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
@ -13,25 +10,25 @@ use garage_util::data::*;
use garage_util::error::Error;
use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
use garage_rpc::rpc_client::*;
use garage_rpc::rpc_server::*;
use crate::crdt::CRDT;
use crate::data::*;
use crate::gc::*;
use crate::merkle::*;
use crate::replication::*;
use crate::schema::*;
use crate::table_sync::*;
use crate::sync::*;
const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
pub struct Table<F: TableSchema, R: TableReplication> {
pub instance: F,
pub replication: R,
pub name: String,
pub(crate) rpc_client: Arc<RpcClient<TableRPC<F>>>,
pub system: Arc<System>,
pub store: sled::Tree,
pub syncer: ArcSwapOption<TableSyncer<F, R>>,
pub data: Arc<TableData<F, R>>,
pub merkle_updater: Arc<MerkleUpdater<F, R>>,
pub syncer: Arc<TableSyncer<F, R>>,
rpc_client: Arc<RpcClient<TableRPC<F>>>,
}
#[derive(Serialize, Deserialize)]
@ -45,30 +42,10 @@ pub(crate) enum TableRPC<F: TableSchema> {
ReadRange(F::P, Option<F::S>, Option<F::Filter>, usize),
Update(Vec<Arc<ByteBuf>>),
SyncRPC(SyncRPC),
}
impl<F: TableSchema> RpcMessage for TableRPC<F> {}
pub trait TableReplication: Send + Sync {
// See examples in table_sharded.rs and table_fullcopy.rs
// To understand various replication methods
// Which nodes to send reads from
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
fn read_quorum(&self) -> usize;
// Which nodes to send writes to
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>;
fn write_quorum(&self, system: &System) -> usize;
fn max_write_errors(&self) -> usize;
// Which are the nodes that do actually replicate the data
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
fn split_points(&self, ring: &Ring) -> Vec<Hash>;
}
impl<F, R> Table<F, R>
where
F: TableSchema + 'static,
@ -76,7 +53,7 @@ where
{
// =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) ===============
pub async fn new(
pub fn new(
instance: F,
replication: R,
system: Arc<System>,
@ -84,31 +61,37 @@ where
name: String,
rpc_server: &mut RpcServer,
) -> Arc<Self> {
let store = db.open_tree(&name).expect("Unable to open DB tree");
let rpc_path = format!("table_{}", name);
let rpc_client = system.rpc_client::<TableRPC<F>>(&rpc_path);
let table = Arc::new(Self {
instance,
replication,
name,
rpc_client,
system,
store,
syncer: ArcSwapOption::from(None),
});
table.clone().register_handler(rpc_server, rpc_path);
let data = TableData::new(name, instance, replication, db);
let syncer = TableSyncer::launch(table.clone()).await;
table.syncer.swap(Some(syncer));
let merkle_updater = MerkleUpdater::launch(&system.background, data.clone());
let syncer = TableSyncer::launch(
system.clone(),
data.clone(),
merkle_updater.clone(),
rpc_server,
);
TableGC::launch(system.clone(), data.clone(), rpc_server);
let table = Arc::new(Self {
system,
data,
merkle_updater,
syncer,
rpc_client,
});
table.clone().register_handler(rpc_server, rpc_path);
table
}
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system);
let who = self.data.replication.write_nodes(&hash);
//eprintln!("insert who: {:?}", who);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
@ -118,7 +101,7 @@ where
.try_call_many(
&who[..],
rpc,
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system))
RequestStrategy::with_quorum(self.data.replication.write_quorum())
.with_timeout(TABLE_RPC_TIMEOUT),
)
.await?;
@ -130,7 +113,7 @@ where
for entry in entries.iter() {
let hash = entry.partition_key().hash();
let who = self.replication.write_nodes(&hash, &self.system);
let who = self.data.replication.write_nodes(&hash);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
for node in who {
if !call_list.contains_key(&node) {
@ -154,7 +137,7 @@ where
errors.push(e);
}
}
if errors.len() > self.replication.max_write_errors() {
if errors.len() > self.data.replication.max_write_errors() {
Err(Error::Message("Too many errors".into()))
} else {
Ok(())
@ -167,7 +150,7 @@ where
sort_key: &F::S,
) -> Result<Option<F::E>, Error> {
let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system);
let who = self.data.replication.read_nodes(&hash);
//eprintln!("get who: {:?}", who);
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
@ -176,7 +159,7 @@ where
.try_call_many(
&who[..],
rpc,
RequestStrategy::with_quorum(self.replication.read_quorum())
RequestStrategy::with_quorum(self.data.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
@ -187,7 +170,7 @@ where
for resp in resps {
if let TableRPC::ReadEntryResponse(value) = resp {
if let Some(v_bytes) = value {
let v = self.decode_entry(v_bytes.as_slice())?;
let v = self.data.decode_entry(v_bytes.as_slice())?;
ret = match ret {
None => Some(v),
Some(mut x) => {
@ -223,7 +206,7 @@ where
limit: usize,
) -> Result<Vec<F::E>, Error> {
let hash = partition_key.hash();
let who = self.replication.read_nodes(&hash, &self.system);
let who = self.data.replication.read_nodes(&hash);
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
@ -232,7 +215,7 @@ where
.try_call_many(
&who[..],
rpc,
RequestStrategy::with_quorum(self.replication.read_quorum())
RequestStrategy::with_quorum(self.data.replication.read_quorum())
.with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
@ -243,8 +226,8 @@ where
for resp in resps {
if let TableRPC::Update(entries) = resp {
for entry_bytes in entries.iter() {
let entry = self.decode_entry(entry_bytes.as_slice())?;
let entry_key = self.tree_key(entry.partition_key(), entry.sort_key());
let entry = self.data.decode_entry(entry_bytes.as_slice())?;
let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key());
match ret.remove(&entry_key) {
None => {
ret.insert(entry_key, Some(entry));
@ -313,146 +296,18 @@ where
async fn handle(self: &Arc<Self>, msg: &TableRPC<F>) -> Result<TableRPC<F>, Error> {
match msg {
TableRPC::ReadEntry(key, sort_key) => {
let value = self.handle_read_entry(key, sort_key)?;
let value = self.data.read_entry(key, sort_key)?;
Ok(TableRPC::ReadEntryResponse(value))
}
TableRPC::ReadRange(key, begin_sort_key, filter, limit) => {
let values = self.handle_read_range(key, begin_sort_key, filter, *limit)?;
let values = self.data.read_range(key, begin_sort_key, filter, *limit)?;
Ok(TableRPC::Update(values))
}
TableRPC::Update(pairs) => {
self.handle_update(pairs).await?;
self.data.update_many(pairs)?;
Ok(TableRPC::Ok)
}
TableRPC::SyncRPC(rpc) => {
let syncer = self.syncer.load_full().unwrap();
let response = syncer
.handle_rpc(rpc, self.system.background.stop_signal.clone())
.await?;
Ok(TableRPC::SyncRPC(response))
}
_ => Err(Error::BadRPC(format!("Unexpected table RPC"))),
}
}
fn handle_read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
let tree_key = self.tree_key(p, s);
if let Some(bytes) = self.store.get(&tree_key)? {
Ok(Some(ByteBuf::from(bytes.to_vec())))
} else {
Ok(None)
}
}
fn handle_read_range(
&self,
p: &F::P,
s: &Option<F::S>,
filter: &Option<F::Filter>,
limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
let partition_hash = p.hash();
let first_key = match s {
None => partition_hash.to_vec(),
Some(sk) => self.tree_key(p, sk),
};
let mut ret = vec![];
for item in self.store.range(first_key..) {
let (key, value) = item?;
if &key[..32] != partition_hash.as_slice() {
break;
}
let keep = match filter {
None => true,
Some(f) => {
let entry = self.decode_entry(value.as_ref())?;
F::matches_filter(&entry, f)
}
};
if keep {
ret.push(Arc::new(ByteBuf::from(value.as_ref())));
}
if ret.len() >= limit {
break;
}
}
Ok(ret)
}
pub async fn handle_update(self: &Arc<Self>, entries: &[Arc<ByteBuf>]) -> Result<(), Error> {
let syncer = self.syncer.load_full().unwrap();
for update_bytes in entries.iter() {
let update = self.decode_entry(update_bytes.as_slice())?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
let (old_entry, new_entry) = self.store.transaction(|db| {
let (old_entry, new_entry) = match db.get(&tree_key)? {
Some(prev_bytes) => {
let old_entry = self
.decode_entry(&prev_bytes)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
let mut new_entry = old_entry.clone();
new_entry.merge(&update);
(Some(old_entry), new_entry)
}
None => (None, update.clone()),
};
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RMPEncode)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
db.insert(tree_key.clone(), new_bytes)?;
Ok((old_entry, new_entry))
})?;
if old_entry.as_ref() != Some(&new_entry) {
self.instance.updated(old_entry, Some(new_entry));
syncer.invalidate(&tree_key[..]);
}
}
Ok(())
}
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
let removed = self.store.transaction(|txn| {
if let Some(cur_v) = txn.get(k)? {
if cur_v == v {
txn.remove(k)?;
return Ok(true);
}
}
Ok(false)
})?;
if removed {
let old_entry = self.decode_entry(v)?;
self.instance.updated(Some(old_entry), None);
self.syncer.load_full().unwrap().invalidate(k);
}
Ok(removed)
}
fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
let mut ret = p.hash().to_vec();
ret.extend(s.sort_key());
ret
}
fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
Ok(x) => Ok(x),
Err(e) => match F::try_migrate(bytes) {
Some(x) => Ok(x),
None => {
warn!("Unable to decode entry of {}: {}", self.name, e);
for line in hexdump::hexdump_iter(bytes) {
debug!("{}", line);
}
Err(e.into())
}
},
}
}
}

View file

@ -1,59 +0,0 @@
use std::sync::Arc;
use garage_rpc::membership::System;
use garage_rpc::ring::Ring;
use garage_util::data::*;
use crate::*;
#[derive(Clone)]
pub struct TableFullReplication {
pub max_faults: usize,
}
#[derive(Clone)]
struct Neighbors {
ring: Arc<Ring>,
neighbors: Vec<UUID>,
}
impl TableFullReplication {
pub fn new(max_faults: usize) -> Self {
TableFullReplication { max_faults }
}
}
impl TableReplication for TableFullReplication {
// Full replication schema: all nodes store everything
// Writes are disseminated in an epidemic manner in the network
// Advantage: do all reads locally, extremely fast
// Inconvenient: only suitable to reasonably small tables
fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec<UUID> {
vec![system.id]
}
fn read_quorum(&self) -> usize {
1
}
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> {
self.replication_nodes(hash, system.ring.borrow().as_ref())
}
fn write_quorum(&self, system: &System) -> usize {
system.ring.borrow().config.members.len() - self.max_faults
}
fn max_write_errors(&self) -> usize {
self.max_faults
}
fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec<UUID> {
ring.config.members.keys().cloned().collect::<Vec<_>>()
}
fn split_points(&self, _ring: &Ring) -> Vec<Hash> {
let mut ret = vec![];
ret.push([0u8; 32].into());
ret.push([0xFFu8; 32].into());
ret
}
}

View file

@ -1,891 +0,0 @@
use rand::Rng;
use std::collections::{BTreeMap, VecDeque};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use futures::future::join_all;
use futures::{pin_mut, select};
use futures_util::future::*;
use futures_util::stream::*;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use tokio::sync::{mpsc, watch};
use garage_rpc::ring::Ring;
use garage_util::data::*;
use garage_util::error::Error;
use crate::*;
const MAX_DEPTH: usize = 16;
const SCAN_INTERVAL: Duration = Duration::from_secs(3600);
const CHECKSUM_CACHE_TIMEOUT: Duration = Duration::from_secs(1800);
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
pub struct TableSyncer<F: TableSchema, R: TableReplication> {
table: Arc<Table<F, R>>,
todo: Mutex<SyncTodo>,
cache: Vec<Mutex<BTreeMap<SyncRange, RangeChecksumCache>>>,
}
#[derive(Serialize, Deserialize)]
pub(crate) enum SyncRPC {
GetRootChecksumRange(Hash, Hash),
RootChecksumRange(SyncRange),
Checksums(Vec<RangeChecksum>),
Difference(Vec<SyncRange>, Vec<Arc<ByteBuf>>),
}
struct SyncTodo {
todo: Vec<TodoPartition>,
}
#[derive(Debug, Clone)]
struct TodoPartition {
// Partition consists in hashes between begin included and end excluded
begin: Hash,
end: Hash,
// Are we a node that stores this partition or not?
retain: bool,
}
// A SyncRange defines a query on the dataset stored by a node, in the following way:
// - all items whose key are >= `begin`
// - stopping at the first item whose key hash has at least `level` leading zero bytes (excluded)
// - except if the first item of the range has such many leading zero bytes
// - and stopping at `end` (excluded) if such an item is not found
// The checksum itself does not store all of the items in the database, only the hashes of the "sub-ranges"
// i.e. of ranges of level `level-1` that cover the same range
// (ranges of level 0 do not exist and their hash is simply the hash of the first item >= begin)
// See RangeChecksum for the struct that stores this information.
#[derive(Hash, PartialEq, Eq, Debug, Clone, Serialize, Deserialize)]
pub(crate) struct SyncRange {
begin: Vec<u8>,
end: Vec<u8>,
level: usize,
}
impl std::cmp::PartialOrd for SyncRange {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl std::cmp::Ord for SyncRange {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.begin
.cmp(&other.begin)
.then(self.level.cmp(&other.level))
.then(self.end.cmp(&other.end))
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct RangeChecksum {
bounds: SyncRange,
children: Vec<(SyncRange, Hash)>,
found_limit: Option<Vec<u8>>,
#[serde(skip, default = "std::time::Instant::now")]
time: Instant,
}
#[derive(Debug, Clone)]
struct RangeChecksumCache {
hash: Option<Hash>, // None if no children
found_limit: Option<Vec<u8>>,
time: Instant,
}
impl<F, R> TableSyncer<F, R>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
pub(crate) async fn launch(table: Arc<Table<F, R>>) -> Arc<Self> {
let todo = SyncTodo { todo: Vec::new() };
let syncer = Arc::new(TableSyncer {
table: table.clone(),
todo: Mutex::new(todo),
cache: (0..MAX_DEPTH)
.map(|_| Mutex::new(BTreeMap::new()))
.collect::<Vec<_>>(),
});
let (busy_tx, busy_rx) = mpsc::unbounded_channel();
let s1 = syncer.clone();
table
.system
.background
.spawn_worker(
format!("table sync watcher for {}", table.name),
move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
)
.await;
let s2 = syncer.clone();
table
.system
.background
.spawn_worker(
format!("table syncer for {}", table.name),
move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
)
.await;
let s3 = syncer.clone();
tokio::spawn(async move {
tokio::time::delay_for(Duration::from_secs(20)).await;
s3.add_full_scan().await;
});
syncer
}
async fn watcher_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
mut busy_rx: mpsc::UnboundedReceiver<bool>,
) -> Result<(), Error> {
let mut prev_ring: Arc<Ring> = self.table.system.ring.borrow().clone();
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.table.system.ring.clone();
let mut nothing_to_do_since = Some(Instant::now());
while !*must_exit.borrow() {
let s_ring_recv = ring_recv.recv().fuse();
let s_busy = busy_rx.recv().fuse();
let s_must_exit = must_exit.recv().fuse();
let s_timeout = tokio::time::delay_for(Duration::from_secs(1)).fuse();
pin_mut!(s_ring_recv, s_busy, s_must_exit, s_timeout);
select! {
new_ring_r = s_ring_recv => {
if let Some(new_ring) = new_ring_r {
debug!("({}) Adding ring difference to syncer todo list", self.table.name);
self.todo.lock().unwrap().add_ring_difference(&self.table, &prev_ring, &new_ring);
prev_ring = new_ring;
}
}
busy_opt = s_busy => {
if let Some(busy) = busy_opt {
if busy {
nothing_to_do_since = None;
} else {
if nothing_to_do_since.is_none() {
nothing_to_do_since = Some(Instant::now());
}
}
}
}
must_exit_v = s_must_exit => {
if must_exit_v.unwrap_or(false) {
break;
}
}
_ = s_timeout => {
if nothing_to_do_since.map(|t| Instant::now() - t >= SCAN_INTERVAL).unwrap_or(false) {
nothing_to_do_since = None;
debug!("({}) Adding full scan to syncer todo list", self.table.name);
self.add_full_scan().await;
}
}
}
}
Ok(())
}
pub async fn add_full_scan(&self) {
self.todo.lock().unwrap().add_full_scan(&self.table);
}
async fn syncer_task(
self: Arc<Self>,
mut must_exit: watch::Receiver<bool>,
busy_tx: mpsc::UnboundedSender<bool>,
) -> Result<(), Error> {
while !*must_exit.borrow() {
let task = self.todo.lock().unwrap().pop_task();
if let Some(partition) = task {
busy_tx.send(true)?;
let res = self
.clone()
.sync_partition(&partition, &mut must_exit)
.await;
if let Err(e) = res {
warn!(
"({}) Error while syncing {:?}: {}",
self.table.name, partition, e
);
}
} else {
busy_tx.send(false)?;
tokio::time::delay_for(Duration::from_secs(1)).await;
}
}
Ok(())
}
async fn sync_partition(
self: Arc<Self>,
partition: &TodoPartition,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
if partition.retain {
let my_id = self.table.system.id;
let nodes = self
.table
.replication
.write_nodes(&partition.begin, &self.table.system)
.into_iter()
.filter(|node| *node != my_id)
.collect::<Vec<_>>();
debug!(
"({}) Preparing to sync {:?} with {:?}...",
self.table.name, partition, nodes
);
let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?;
let mut sync_futures = nodes
.iter()
.map(|node| {
self.clone().do_sync_with(
partition.clone(),
root_cks.clone(),
*node,
must_exit.clone(),
)
})
.collect::<FuturesUnordered<_>>();
let mut n_errors = 0;
while let Some(r) = sync_futures.next().await {
if let Err(e) = r {
n_errors += 1;
warn!("({}) Sync error: {}", self.table.name, e);
}
}
if n_errors > self.table.replication.max_write_errors() {
return Err(Error::Message(format!(
"Sync failed with too many nodes (should have been: {:?}).",
nodes
)));
}
} else {
self.offload_partition(&partition.begin, &partition.end, must_exit)
.await?;
}
Ok(())
}
// Offload partition: this partition is not something we are storing,
// so send it out to all other nodes that store it and delete items locally.
// We don't bother checking if the remote nodes already have the items,
// we just batch-send everything. Offloading isn't supposed to happen very often.
// If any of the nodes that are supposed to store the items is unable to
// save them, we interrupt the process.
async fn offload_partition(
self: &Arc<Self>,
begin: &Hash,
end: &Hash,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
let mut counter: usize = 0;
while !*must_exit.borrow() {
let mut items = Vec::new();
for item in self.table.store.range(begin.to_vec()..end.to_vec()) {
let (key, value) = item?;
items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
if items.len() >= 1024 {
break;
}
}
if items.len() > 0 {
let nodes = self
.table
.replication
.write_nodes(&begin, &self.table.system)
.into_iter()
.collect::<Vec<_>>();
if nodes.contains(&self.table.system.id) {
warn!("Interrupting offload as partitions seem to have changed");
break;
}
counter += 1;
debug!(
"Offloading {} items from {:?}..{:?} ({})",
items.len(),
begin,
end,
counter
);
self.offload_items(&items, &nodes[..]).await?;
} else {
break;
}
}
Ok(())
}
async fn offload_items(
self: &Arc<Self>,
items: &Vec<(Vec<u8>, Arc<ByteBuf>)>,
nodes: &[UUID],
) -> Result<(), Error> {
let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();
let update_msg = Arc::new(TableRPC::<F>::Update(values));
for res in join_all(nodes.iter().map(|to| {
self.table
.rpc_client
.call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT)
}))
.await
{
res?;
}
// All remote nodes have written those items, now we can delete them locally
let mut not_removed = 0;
for (k, v) in items.iter() {
if !self.table.delete_if_equal(&k[..], &v[..])? {
not_removed += 1;
}
}
if not_removed > 0 {
debug!("{} items not removed during offload because they changed in between (trying again...)", not_removed);
}
Ok(())
}
fn root_checksum(
self: &Arc<Self>,
begin: &Hash,
end: &Hash,
must_exit: &mut watch::Receiver<bool>,
) -> Result<RangeChecksum, Error> {
for i in 1..MAX_DEPTH {
let rc = self.range_checksum(
&SyncRange {
begin: begin.to_vec(),
end: end.to_vec(),
level: i,
},
must_exit,
)?;
if rc.found_limit.is_none() {
return Ok(rc);
}
}
Err(Error::Message(format!(
"Unable to compute root checksum (this should never happen)"
)))
}
fn range_checksum(
self: &Arc<Self>,
range: &SyncRange,
must_exit: &mut watch::Receiver<bool>,
) -> Result<RangeChecksum, Error> {
assert!(range.level != 0);
trace!("Call range_checksum {:?}", range);
if range.level == 1 {
let mut children = vec![];
for item in self
.table
.store
.range(range.begin.clone()..range.end.clone())
{
let (key, value) = item?;
let key_hash = blake2sum(&key[..]);
if children.len() > 0
&& key_hash.as_slice()[0..range.level]
.iter()
.all(|x| *x == 0u8)
{
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
return Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: Some(key.to_vec()),
time: Instant::now(),
});
}
let item_range = SyncRange {
begin: key.to_vec(),
end: vec![],
level: 0,
};
children.push((item_range, blake2sum(&value[..])));
}
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: None,
time: Instant::now(),
})
} else {
let mut children = vec![];
let mut sub_range = SyncRange {
begin: range.begin.clone(),
end: range.end.clone(),
level: range.level - 1,
};
let mut time = Instant::now();
while !*must_exit.borrow() {
let sub_ck = self.range_checksum_cached_hash(&sub_range, must_exit)?;
if let Some(hash) = sub_ck.hash {
children.push((sub_range.clone(), hash));
if sub_ck.time < time {
time = sub_ck.time;
}
}
if sub_ck.found_limit.is_none() || sub_ck.hash.is_none() {
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
return Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: None,
time,
});
}
let found_limit = sub_ck.found_limit.unwrap();
let actual_limit_hash = blake2sum(&found_limit[..]);
if actual_limit_hash.as_slice()[0..range.level]
.iter()
.all(|x| *x == 0u8)
{
trace!(
"range_checksum {:?} returning {} items",
range,
children.len()
);
return Ok(RangeChecksum {
bounds: range.clone(),
children,
found_limit: Some(found_limit.clone()),
time,
});
}
sub_range.begin = found_limit;
}
trace!("range_checksum {:?} exiting due to must_exit", range);
Err(Error::Message(format!("Exiting.")))
}
}
fn range_checksum_cached_hash(
self: &Arc<Self>,
range: &SyncRange,
must_exit: &mut watch::Receiver<bool>,
) -> Result<RangeChecksumCache, Error> {
{
let mut cache = self.cache[range.level].lock().unwrap();
if let Some(v) = cache.get(&range) {
if Instant::now() - v.time < CHECKSUM_CACHE_TIMEOUT {
return Ok(v.clone());
}
}
cache.remove(&range);
}
let v = self.range_checksum(&range, must_exit)?;
trace!(
"({}) New checksum calculated for {}-{}/{}, {} children",
self.table.name,
hex::encode(&range.begin)
.chars()
.take(16)
.collect::<String>(),
hex::encode(&range.end).chars().take(16).collect::<String>(),
range.level,
v.children.len()
);
let hash = if v.children.len() > 0 {
Some(blake2sum(&rmp_to_vec_all_named(&v)?[..]))
} else {
None
};
let cache_entry = RangeChecksumCache {
hash,
found_limit: v.found_limit,
time: v.time,
};
let mut cache = self.cache[range.level].lock().unwrap();
cache.insert(range.clone(), cache_entry.clone());
Ok(cache_entry)
}
async fn do_sync_with(
self: Arc<Self>,
partition: TodoPartition,
root_ck: RangeChecksum,
who: UUID,
mut must_exit: watch::Receiver<bool>,
) -> Result<(), Error> {
let mut todo = VecDeque::new();
// If their root checksum has level > than us, use that as a reference
let root_cks_resp = self
.table
.rpc_client
.call(
who,
TableRPC::<F>::SyncRPC(SyncRPC::GetRootChecksumRange(
partition.begin.clone(),
partition.end.clone(),
)),
TABLE_SYNC_RPC_TIMEOUT,
)
.await?;
if let TableRPC::<F>::SyncRPC(SyncRPC::RootChecksumRange(range)) = root_cks_resp {
if range.level > root_ck.bounds.level {
let their_root_range_ck = self.range_checksum(&range, &mut must_exit)?;
todo.push_back(their_root_range_ck);
} else {
todo.push_back(root_ck);
}
} else {
return Err(Error::Message(format!(
"Invalid respone to GetRootChecksumRange RPC: {}",
debug_serialize(root_cks_resp)
)));
}
while !todo.is_empty() && !*must_exit.borrow() {
let total_children = todo.iter().map(|x| x.children.len()).fold(0, |x, y| x + y);
trace!(
"({}) Sync with {:?}: {} ({}) remaining",
self.table.name,
who,
todo.len(),
total_children
);
let step_size = std::cmp::min(16, todo.len());
let step = todo.drain(..step_size).collect::<Vec<_>>();
let rpc_resp = self
.table
.rpc_client
.call(
who,
TableRPC::<F>::SyncRPC(SyncRPC::Checksums(step)),
TABLE_SYNC_RPC_TIMEOUT,
)
.await?;
if let TableRPC::<F>::SyncRPC(SyncRPC::Difference(mut diff_ranges, diff_items)) =
rpc_resp
{
if diff_ranges.len() > 0 || diff_items.len() > 0 {
info!(
"({}) Sync with {:?}: difference {} ranges, {} items",
self.table.name,
who,
diff_ranges.len(),
diff_items.len()
);
}
let mut items_to_send = vec![];
for differing in diff_ranges.drain(..) {
if differing.level == 0 {
items_to_send.push(differing.begin);
} else {
let checksum = self.range_checksum(&differing, &mut must_exit)?;
todo.push_back(checksum);
}
}
if diff_items.len() > 0 {
self.table.handle_update(&diff_items[..]).await?;
}
if items_to_send.len() > 0 {
self.send_items(who, items_to_send).await?;
}
} else {
return Err(Error::Message(format!(
"Unexpected response to sync RPC checksums: {}",
debug_serialize(&rpc_resp)
)));
}
}
Ok(())
}
async fn send_items(&self, who: UUID, item_list: Vec<Vec<u8>>) -> Result<(), Error> {
info!(
"({}) Sending {} items to {:?}",
self.table.name,
item_list.len(),
who
);
let mut values = vec![];
for item in item_list.iter() {
if let Some(v) = self.table.store.get(&item[..])? {
values.push(Arc::new(ByteBuf::from(v.as_ref())));
}
}
let rpc_resp = self
.table
.rpc_client
.call(who, TableRPC::<F>::Update(values), TABLE_SYNC_RPC_TIMEOUT)
.await?;
if let TableRPC::<F>::Ok = rpc_resp {
Ok(())
} else {
Err(Error::Message(format!(
"Unexpected response to RPC Update: {}",
debug_serialize(&rpc_resp)
)))
}
}
pub(crate) async fn handle_rpc(
self: &Arc<Self>,
message: &SyncRPC,
mut must_exit: watch::Receiver<bool>,
) -> Result<SyncRPC, Error> {
match message {
SyncRPC::GetRootChecksumRange(begin, end) => {
let root_cks = self.root_checksum(&begin, &end, &mut must_exit)?;
Ok(SyncRPC::RootChecksumRange(root_cks.bounds))
}
SyncRPC::Checksums(checksums) => {
self.handle_checksums_rpc(&checksums[..], &mut must_exit)
.await
}
_ => Err(Error::Message(format!("Unexpected sync RPC"))),
}
}
async fn handle_checksums_rpc(
self: &Arc<Self>,
checksums: &[RangeChecksum],
must_exit: &mut watch::Receiver<bool>,
) -> Result<SyncRPC, Error> {
let mut ret_ranges = vec![];
let mut ret_items = vec![];
for their_ckr in checksums.iter() {
let our_ckr = self.range_checksum(&their_ckr.bounds, must_exit)?;
for (their_range, their_hash) in their_ckr.children.iter() {
let differs = match our_ckr
.children
.binary_search_by(|(our_range, _)| our_range.cmp(&their_range))
{
Err(_) => {
if their_range.level >= 1 {
let cached_hash =
self.range_checksum_cached_hash(&their_range, must_exit)?;
cached_hash.hash.map(|h| h != *their_hash).unwrap_or(true)
} else {
true
}
}
Ok(i) => our_ckr.children[i].1 != *their_hash,
};
if differs {
ret_ranges.push(their_range.clone());
if their_range.level == 0 {
if let Some(item_bytes) =
self.table.store.get(their_range.begin.as_slice())?
{
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
}
}
}
}
for (our_range, _hash) in our_ckr.children.iter() {
if let Some(their_found_limit) = &their_ckr.found_limit {
if our_range.begin.as_slice() > their_found_limit.as_slice() {
break;
}
}
let not_present = our_ckr
.children
.binary_search_by(|(their_range, _)| their_range.cmp(&our_range))
.is_err();
if not_present {
if our_range.level > 0 {
ret_ranges.push(our_range.clone());
}
if our_range.level == 0 {
if let Some(item_bytes) =
self.table.store.get(our_range.begin.as_slice())?
{
ret_items.push(Arc::new(ByteBuf::from(item_bytes.to_vec())));
}
}
}
}
}
let n_checksums = checksums
.iter()
.map(|x| x.children.len())
.fold(0, |x, y| x + y);
if ret_ranges.len() > 0 || ret_items.len() > 0 {
trace!(
"({}) Checksum comparison RPC: {} different + {} items for {} received",
self.table.name,
ret_ranges.len(),
ret_items.len(),
n_checksums
);
}
Ok(SyncRPC::Difference(ret_ranges, ret_items))
}
pub(crate) fn invalidate(self: &Arc<Self>, item_key: &[u8]) {
for i in 1..MAX_DEPTH {
let needle = SyncRange {
begin: item_key.to_vec(),
end: vec![],
level: i,
};
let mut cache = self.cache[i].lock().unwrap();
if let Some(cache_entry) = cache.range(..=needle).rev().next() {
if cache_entry.0.begin[..] <= *item_key && cache_entry.0.end[..] > *item_key {
let index = cache_entry.0.clone();
drop(cache_entry);
cache.remove(&index);
}
}
}
}
}
impl SyncTodo {
fn add_full_scan<F: TableSchema, R: TableReplication>(&mut self, table: &Table<F, R>) {
let my_id = table.system.id;
self.todo.clear();
let ring = table.system.ring.borrow().clone();
let split_points = table.replication.split_points(&ring);
for i in 0..split_points.len() - 1 {
let begin = split_points[i];
let end = split_points[i + 1];
let nodes = table.replication.replication_nodes(&begin, &ring);
let retain = nodes.contains(&my_id);
if !retain {
// Check if we have some data to send, otherwise skip
if table.store.range(begin..end).next().is_none() {
continue;
}
}
self.todo.push(TodoPartition { begin, end, retain });
}
}
fn add_ring_difference<F: TableSchema, R: TableReplication>(
&mut self,
table: &Table<F, R>,
old_ring: &Ring,
new_ring: &Ring,
) {
let my_id = table.system.id;
// If it is us who are entering or leaving the system,
// initiate a full sync instead of incremental sync
if old_ring.config.members.contains_key(&my_id)
!= new_ring.config.members.contains_key(&my_id)
{
self.add_full_scan(table);
return;
}
let mut all_points = None
.into_iter()
.chain(table.replication.split_points(old_ring).drain(..))
.chain(table.replication.split_points(new_ring).drain(..))
.chain(self.todo.iter().map(|x| x.begin))
.chain(self.todo.iter().map(|x| x.end))
.collect::<Vec<_>>();
all_points.sort();
all_points.dedup();
let mut old_todo = std::mem::replace(&mut self.todo, vec![]);
old_todo.sort_by(|x, y| x.begin.cmp(&y.begin));
let mut new_todo = vec![];
for i in 0..all_points.len() - 1 {
let begin = all_points[i];
let end = all_points[i + 1];
let was_ours = table
.replication
.replication_nodes(&begin, &old_ring)
.contains(&my_id);
let is_ours = table
.replication
.replication_nodes(&begin, &new_ring)
.contains(&my_id);
let was_todo = match old_todo.binary_search_by(|x| x.begin.cmp(&begin)) {
Ok(_) => true,
Err(j) => {
(j > 0 && old_todo[j - 1].begin < end && begin < old_todo[j - 1].end)
|| (j < old_todo.len()
&& old_todo[j].begin < end && begin < old_todo[j].end)
}
};
if was_todo || (is_ours && !was_ours) || (was_ours && !is_ours) {
new_todo.push(TodoPartition {
begin,
end,
retain: is_ours,
});
}
}
self.todo = new_todo;
}
fn pop_task(&mut self) -> Option<TodoPartition> {
if self.todo.is_empty() {
return None;
}
let i = rand::thread_rng().gen_range::<usize, _, _>(0, self.todo.len());
if i == self.todo.len() - 1 {
self.todo.pop()
} else {
let replacement = self.todo.pop().unwrap();
let ret = std::mem::replace(&mut self.todo[i], replacement);
Some(ret)
}
}
}

View file

@ -13,29 +13,26 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
rand = "0.7"
hex = "0.3"
sha2 = "0.8"
rand = "0.8"
hex = "0.4"
sha2 = "0.9"
blake2 = "0.9"
err-derive = "0.2.3"
err-derive = "0.3"
log = "0.4"
fasthash = "0.4"
sled = "0.34"
toml = "0.5"
rmp-serde = "0.14.3"
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_json = "1.0"
chrono = "0.4"
futures = "0.3"
futures-util = "0.3"
tokio = { version = "0.2", default-features = false, features = ["rt-core", "rt-threaded", "io-driver", "net", "tcp", "time", "macros", "sync", "signal", "fs"] }
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
http = "0.2"
hyper = "0.13"
rustls = "0.17"
hyper = "0.14"
rustls = "0.19"
webpki = "0.21"
roxmltree = "0.11"

View file

@ -1,12 +1,11 @@
use core::future::Future;
use std::pin::Pin;
use futures::future::join_all;
use futures::select;
use futures_util::future::*;
use std::sync::Arc;
use tokio::sync::Mutex;
use tokio::sync::{mpsc, watch, Notify};
use std::time::Duration;
use futures::future::*;
use futures::select;
use tokio::sync::{mpsc, watch, Mutex};
use crate::error::Error;
@ -14,54 +13,106 @@ type JobOutput = Result<(), Error>;
type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
pub struct BackgroundRunner {
n_runners: usize,
pub stop_signal: watch::Receiver<bool>,
queue_in: mpsc::UnboundedSender<(Job, bool)>,
queue_out: Mutex<mpsc::UnboundedReceiver<(Job, bool)>>,
job_notify: Notify,
workers: Mutex<Vec<tokio::task::JoinHandle<()>>>,
worker_in: mpsc::UnboundedSender<tokio::task::JoinHandle<()>>,
}
impl BackgroundRunner {
pub fn new(n_runners: usize, stop_signal: watch::Receiver<bool>) -> Arc<Self> {
pub fn new(
n_runners: usize,
stop_signal: watch::Receiver<bool>,
) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
let (worker_in, mut worker_out) = mpsc::unbounded_channel();
let stop_signal_2 = stop_signal.clone();
let await_all_done = tokio::spawn(async move {
loop {
let wkr = {
select! {
item = worker_out.recv().fuse() => {
match item {
Some(x) => x,
None => break,
}
}
_ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
if *stop_signal_2.borrow() {
break;
} else {
continue;
}
}
}
};
if let Err(e) = wkr.await {
error!("Error while awaiting for worker: {}", e);
}
}
});
let (queue_in, queue_out) = mpsc::unbounded_channel();
Arc::new(Self {
n_runners,
let queue_out = Arc::new(Mutex::new(queue_out));
for i in 0..n_runners {
let queue_out = queue_out.clone();
let stop_signal = stop_signal.clone();
worker_in
.send(tokio::spawn(async move {
loop {
let (job, cancellable) = {
select! {
item = wait_job(&queue_out).fuse() => match item {
// We received a task, process it
Some(x) => x,
// We received a signal that no more tasks will ever be sent
// because the sending side was dropped. Exit now.
None => break,
},
_ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
if *stop_signal.borrow() {
// Nothing has been going on for 5 secs, and we are shutting
// down. Exit now.
break;
} else {
// Nothing is going on but we don't want to exit.
continue;
}
}
}
};
if cancellable && *stop_signal.borrow() {
continue;
}
if let Err(e) = job.await {
error!("Job failed: {}", e)
}
}
info!("Background worker {} exiting", i);
}))
.unwrap();
}
let bgrunner = Arc::new(Self {
stop_signal,
queue_in,
queue_out: Mutex::new(queue_out),
job_notify: Notify::new(),
workers: Mutex::new(Vec::new()),
})
}
pub async fn run(self: Arc<Self>) {
let mut workers = self.workers.lock().await;
for i in 0..self.n_runners {
workers.push(tokio::spawn(self.clone().runner(i)));
}
drop(workers);
let mut stop_signal = self.stop_signal.clone();
while let Some(exit_now) = stop_signal.recv().await {
if exit_now {
let mut workers = self.workers.lock().await;
let workers_vec = workers.drain(..).collect::<Vec<_>>();
join_all(workers_vec).await;
return;
}
}
worker_in,
});
(bgrunner, await_all_done)
}
// Spawn a task to be run in background
pub fn spawn<T>(&self, job: T)
where
T: Future<Output = JobOutput> + Send + 'static,
{
let boxed: Job = Box::pin(job);
let _: Result<_, _> = self.queue_in.clone().send((boxed, false));
self.job_notify.notify();
self.queue_in
.send((boxed, false))
.map_err(|_| "could not put job in queue")
.unwrap();
}
pub fn spawn_cancellable<T>(&self, job: T)
@ -69,56 +120,30 @@ impl BackgroundRunner {
T: Future<Output = JobOutput> + Send + 'static,
{
let boxed: Job = Box::pin(job);
let _: Result<_, _> = self.queue_in.clone().send((boxed, true));
self.job_notify.notify();
self.queue_in
.send((boxed, true))
.map_err(|_| "could not put job in queue")
.unwrap();
}
pub async fn spawn_worker<F, T>(&self, name: String, worker: F)
pub fn spawn_worker<F, T>(&self, name: String, worker: F)
where
F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static,
T: Future<Output = JobOutput> + Send + 'static,
T: Future<Output = ()> + Send + 'static,
{
let mut workers = self.workers.lock().await;
let stop_signal = self.stop_signal.clone();
workers.push(tokio::spawn(async move {
if let Err(e) = worker(stop_signal).await {
error!("Worker stopped with error: {}, error: {}", name, e);
} else {
info!("Worker exited successfully: {}", name);
}
}));
}
async fn runner(self: Arc<Self>, i: usize) {
let mut stop_signal = self.stop_signal.clone();
loop {
let must_exit: bool = *stop_signal.borrow();
if let Some(job) = self.dequeue_job(must_exit).await {
if let Err(e) = job.await {
error!("Job failed: {}", e)
}
} else {
if must_exit {
info!("Background runner {} exiting", i);
return;
}
select! {
_ = self.job_notify.notified().fuse() => (),
_ = stop_signal.recv().fuse() => (),
}
}
}
}
async fn dequeue_job(&self, must_exit: bool) -> Option<Job> {
let mut queue = self.queue_out.lock().await;
while let Ok((job, cancellable)) = queue.try_recv() {
if cancellable && must_exit {
continue;
} else {
return Some(job);
}
}
None
let task = tokio::spawn(async move {
info!("Worker started: {}", name);
worker(stop_signal).await;
info!("Worker exited: {}", name);
});
self.worker_in
.send(task)
.map_err(|_| "could not put job in queue")
.unwrap();
}
}
async fn wait_job(q: &Mutex<mpsc::UnboundedReceiver<(Job, bool)>>) -> Option<(Job, bool)> {
q.lock().await.recv().await
}

View file

@ -2,7 +2,6 @@ use rand::Rng;
use serde::de::{self, Visitor};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::fmt;
use std::time::{SystemTime, UNIX_EPOCH};
#[derive(Default, PartialOrd, Ord, Clone, Hash, PartialEq, Copy)]
pub struct FixedBytes32([u8; 32]);
@ -71,6 +70,14 @@ impl FixedBytes32 {
pub fn to_vec(&self) -> Vec<u8> {
self.0.to_vec()
}
pub fn try_from(by: &[u8]) -> Option<Self> {
if by.len() != 32 {
return None;
}
let mut ret = [0u8; 32];
ret.copy_from_slice(by);
Some(Self(ret))
}
}
pub type UUID = FixedBytes32;
@ -80,9 +87,9 @@ pub fn sha256sum(data: &[u8]) -> Hash {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.input(data);
hasher.update(data);
let mut hash = [0u8; 32];
hash.copy_from_slice(&hasher.result()[..]);
hash.copy_from_slice(&hasher.finalize()[..]);
hash.into()
}
@ -111,13 +118,6 @@ pub fn gen_uuid() -> UUID {
rand::thread_rng().gen::<[u8; 32]>().into()
}
pub fn now_msec() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Fix your clock :o")
.as_millis() as u64
}
// RMP serialization with names of fields and variants
pub fn rmp_to_vec_all_named<T>(val: &T) -> Result<Vec<u8>, rmp_serde::encode::Error>

View file

@ -8,16 +8,22 @@ use crate::data::*;
pub enum RPCError {
#[error(display = "Node is down: {:?}.", _0)]
NodeDown(UUID),
#[error(display = "Timeout: {}", _0)]
Timeout(#[error(source)] tokio::time::Elapsed),
Timeout(#[error(source)] tokio::time::error::Elapsed),
#[error(display = "HTTP error: {}", _0)]
HTTP(#[error(source)] http::Error),
#[error(display = "Hyper error: {}", _0)]
Hyper(#[error(source)] hyper::Error),
#[error(display = "Messagepack encode error: {}", _0)]
RMPEncode(#[error(source)] rmp_serde::encode::Error),
#[error(display = "Messagepack decode error: {}", _0)]
RMPDecode(#[error(source)] rmp_serde::decode::Error),
#[error(display = "Too many errors: {:?}", _0)]
TooManyErrors(Vec<String>),
}

View file

@ -5,3 +5,4 @@ pub mod background;
pub mod config;
pub mod data;
pub mod error;
pub mod time;

16
src/util/time.rs Normal file
View file

@ -0,0 +1,16 @@
use chrono::{SecondsFormat, TimeZone, Utc};
use std::time::{SystemTime, UNIX_EPOCH};
pub fn now_msec() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Fix your clock :o")
.as_millis() as u64
}
pub fn msec_to_rfc3339(msecs: u64) -> String {
let secs = msecs as i64 / 1000;
let nanos = (msecs as i64 % 1000) as u32 * 1_000_000;
let timestamp = Utc.timestamp(secs, nanos);
timestamp.to_rfc3339_opts(SecondsFormat::Secs, true)
}

View file

@ -18,11 +18,10 @@ garage_table = { version = "0.1.1", path = "../table" }
garage_model = { version = "0.1.1", path = "../model" }
garage_api = { version = "0.1.1", path = "../api" }
err-derive = "0.2.3"
err-derive = "0.3"
log = "0.4"
futures = "0.3"
http = "0.2"
hyper = "0.13"
hyper = "0.14"
percent-encoding = "2.1.0"
roxmltree = "0.11"
idna = "0.2"