Merge pull request 'read/write quorums on admin operations' (#997) from admin-quorums into next-v2
Reviewed-on: #997
Commit c6d6cc1fc3
31 changed files with 385 additions and 281 deletions
Cargo.lock (generated)
@@ -1404,7 +1404,6 @@ dependencies = [
 "garage_db",
 "garage_net",
 "garage_rpc",
-"garage_table",
 "garage_util",
 "hex",
 "opentelemetry",
@@ -169,8 +169,7 @@ impl AdminApiServer {
		};

		if token_required {
-			verify_authorization(&self.garage, global_token_hash, auth_header, request.name())
-				.await?;
+			verify_authorization(&self.garage, global_token_hash, auth_header, request.name())?;
		}

		match request {
@@ -245,7 +244,7 @@ fn hash_bearer_token(token: &str) -> String {
		.to_string()
}

-async fn verify_authorization(
+fn verify_authorization(
	garage: &Garage,
	global_token_hash: Option<&str>,
	auth_header: Option<hyper::http::HeaderValue>,
@@ -271,8 +270,7 @@ async fn verify_authorization(
	let token_hash_string = if let Some((prefix, _)) = token.split_once('.') {
		garage
			.admin_token_table
-			.get(&EmptyKey, &prefix.to_string())
-			.await?
+			.get_local(&EmptyKey, &prefix.to_string())?
			.and_then(|k| k.state.into_option())
			.filter(|p| {
				p.expiration
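Note: the change above is what makes `verify_authorization` synchronous: the admin token is now looked up with `get_local`, which only consults this node's copy of the table, instead of `get`, which performs a quorum read over the network. A minimal sketch of the distinction, with a stand-in `Table` type rather than Garage's actual table abstraction:

use std::collections::HashMap;

// Stand-in for a replicated table; not Garage's real API.
struct Table<E> {
	local: HashMap<String, E>, // this node's copy of the table
}

impl<E: Clone> Table<E> {
	// Local read: no network round-trip, so callers can stay synchronous.
	// May miss a write that has not yet been propagated to this node.
	fn get_local(&self, key: &str) -> Option<E> {
		self.local.get(key).cloned()
	}

	// Quorum read, modeled as async because it contacts other nodes:
	// it asks R replicas and merges their answers, which gives
	// read-after-write consistency when R + W > N.
	async fn get(&self, key: &str) -> Option<E> {
		// ...RPC to a quorum of replicas would go here...
		self.get_local(key)
	}
}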
@@ -79,18 +79,24 @@ impl RequestHandler for GetBucketInfoRequest {
		garage: &Arc<Garage>,
		_admin: &Admin,
	) -> Result<GetBucketInfoResponse, Error> {
-		let bucket_id = match (self.id, self.global_alias, self.search) {
-			(Some(id), None, None) => parse_bucket_id(&id)?,
-			(None, Some(ga), None) => garage
-				.bucket_alias_table
-				.get(&EmptyKey, &ga)
-				.await?
-				.and_then(|x| *x.state.get())
-				.ok_or_else(|| HelperError::NoSuchBucket(ga.to_string()))?,
+		let bucket = match (self.id, self.global_alias, self.search) {
+			(Some(id), None, None) => {
+				let id = parse_bucket_id(&id)?;
+				garage.bucket_helper().get_existing_bucket(id).await?
+			}
+			(None, Some(ga), None) => {
+				let id = garage
+					.bucket_alias_table
+					.get(&EmptyKey, &ga)
+					.await?
+					.and_then(|x| *x.state.get())
+					.ok_or_else(|| HelperError::NoSuchBucket(ga.to_string()))?;
+				garage.bucket_helper().get_existing_bucket(id).await?
+			}
			(None, None, Some(search)) => {
				let helper = garage.bucket_helper();
-				if let Some(uuid) = helper.resolve_global_bucket_name(&search).await? {
-					uuid
+				if let Some(bucket) = helper.resolve_global_bucket(&search).await? {
+					bucket
				} else {
					let hexdec = if search.len() >= 2 {
						search
@@ -124,7 +130,7 @@ impl RequestHandler for GetBucketInfoRequest {
					if candidates.is_empty() {
						return Err(Error::Common(CommonError::NoSuchBucket(search.clone())));
					} else if candidates.len() == 1 {
-						candidates.into_iter().next().unwrap().id
+						candidates.into_iter().next().unwrap()
					} else {
						return Err(Error::bad_request(format!(
							"Several matching buckets: {}",
@@ -140,23 +146,18 @@ impl RequestHandler for GetBucketInfoRequest {
				}
			}
		};

-		bucket_info_results(garage, bucket_id).await
+		bucket_info_results(garage, bucket).await
	}
}

async fn bucket_info_results(
	garage: &Arc<Garage>,
-	bucket_id: Uuid,
+	bucket: Bucket,
) -> Result<GetBucketInfoResponse, Error> {
-	let bucket = garage
-		.bucket_helper()
-		.get_existing_bucket(bucket_id)
-		.await?;
-
	let counters = garage
		.object_counter_table
		.table
-		.get(&bucket_id, &EmptyKey)
+		.get(&bucket.id, &EmptyKey)
		.await?
		.map(|x| x.filtered_values(&garage.system.cluster_layout()))
		.unwrap_or_default();
@@ -164,7 +165,7 @@ async fn bucket_info_results(
	let mpu_counters = garage
		.mpu_counter_table
		.table
-		.get(&bucket_id, &EmptyKey)
+		.get(&bucket.id, &EmptyKey)
		.await?
		.map(|x| x.filtered_values(&garage.system.cluster_layout()))
		.unwrap_or_default();
@@ -336,7 +337,7 @@ impl RequestHandler for CreateBucketRequest {
		}

		Ok(CreateBucketResponse(
-			bucket_info_results(garage, bucket.id).await?,
+			bucket_info_results(garage, bucket).await?,
		))
	}
}
@@ -444,7 +445,7 @@ impl RequestHandler for UpdateBucketRequest {
		garage.bucket_table.insert(&bucket).await?;

		Ok(UpdateBucketResponse(
-			bucket_info_results(garage, bucket_id).await?,
+			bucket_info_results(garage, bucket).await?,
		))
	}
}
@@ -534,7 +535,7 @@ pub async fn handle_bucket_change_key_perm(
		.set_bucket_key_permissions(bucket.id, &key.key_id, perm)
		.await?;

-	bucket_info_results(garage, bucket.id).await
+	bucket_info_results(garage, bucket).await
}

// ---- BUCKET ALIASES ----
@@ -551,11 +552,11 @@ impl RequestHandler for AddBucketAliasRequest {

		let helper = garage.locked_helper().await;

-		match self.alias {
+		let bucket = match self.alias {
			BucketAliasEnum::Global { global_alias } => {
				helper
					.set_global_bucket_alias(bucket_id, &global_alias)
-					.await?;
+					.await?
			}
			BucketAliasEnum::Local {
				local_alias,
@@ -563,12 +564,12 @@ impl RequestHandler for AddBucketAliasRequest {
			} => {
				helper
					.set_local_bucket_alias(bucket_id, &access_key_id, &local_alias)
-					.await?;
+					.await?
			}
-		}
+		};

		Ok(AddBucketAliasResponse(
-			bucket_info_results(garage, bucket_id).await?,
+			bucket_info_results(garage, bucket).await?,
		))
	}
}
@@ -585,11 +586,11 @@ impl RequestHandler for RemoveBucketAliasRequest {

		let helper = garage.locked_helper().await;

-		match self.alias {
+		let bucket = match self.alias {
			BucketAliasEnum::Global { global_alias } => {
				helper
					.unset_global_bucket_alias(bucket_id, &global_alias)
-					.await?;
+					.await?
			}
			BucketAliasEnum::Local {
				local_alias,
@@ -597,12 +598,12 @@ impl RequestHandler for RemoveBucketAliasRequest {
			} => {
				helper
					.unset_local_bucket_alias(bucket_id, &access_key_id, &local_alias)
-					.await?;
+					.await?
			}
-		}
+		};

		Ok(RemoveBucketAliasResponse(
-			bucket_info_results(garage, bucket_id).await?,
+			bucket_info_results(garage, bucket).await?,
		))
	}
}
@@ -151,12 +151,11 @@ async fn check_domain(garage: &Arc<Garage>, domain: &str) -> Result<bool, Error>
		(domain.to_string(), true)
	};

-	let bucket_id = match garage
+	let bucket = match garage
		.bucket_helper()
-		.resolve_global_bucket_name(&bucket_name)
-		.await?
+		.resolve_global_bucket_fast(&bucket_name)?
	{
-		Some(bucket_id) => bucket_id,
+		Some(b) => b,
		None => return Ok(false),
	};

@@ -164,11 +163,6 @@ async fn check_domain(garage: &Arc<Garage>, domain: &str) -> Result<bool, Error>
		return Ok(true);
	}

-	let bucket = garage
-		.bucket_helper()
-		.get_existing_bucket(bucket_id)
-		.await?;
-
	let bucket_state = bucket.state.as_option().unwrap();
	let bucket_website_config = bucket_state.website_config.get();

@@ -9,9 +9,7 @@ use hyper::{body::Body, body::Incoming as IncomingBody, Request, Response, Statu
use garage_model::bucket_table::{BucketParams, CorsRule as GarageCorsRule};
use garage_model::garage::Garage;

-use crate::common_error::{
-	helper_error_as_internal, CommonError, OkOrBadRequest, OkOrInternalError,
-};
+use crate::common_error::{CommonError, OkOrBadRequest, OkOrInternalError};
use crate::helpers::*;

pub fn find_matching_cors_rule<'a, B>(
@@ -76,7 +74,7 @@ pub fn add_cors_headers(
	Ok(())
}

-pub async fn handle_options_api(
+pub fn handle_options_api(
	garage: Arc<Garage>,
	req: &Request<IncomingBody>,
	bucket_name: Option<String>,
@@ -93,16 +91,8 @@ pub async fn handle_options_api(
	// OPTIONS calls are not auhtenticated).
	if let Some(bn) = bucket_name {
		let helper = garage.bucket_helper();
-		let bucket_id = helper
-			.resolve_global_bucket_name(&bn)
-			.await
-			.map_err(helper_error_as_internal)?;
-		if let Some(id) = bucket_id {
-			let bucket = garage
-				.bucket_helper()
-				.get_existing_bucket(id)
-				.await
-				.map_err(helper_error_as_internal)?;
+		let bucket_opt = helper.resolve_global_bucket_fast(&bn)?;
+		if let Some(bucket) = bucket_opt {
			let bucket_params = bucket.state.into_option().unwrap();
			handle_options_for_bucket(req, &bucket_params)
		} else {
@@ -64,12 +64,12 @@ pub struct VerifiedRequest {
	pub content_sha256_header: ContentSha256Header,
}

-pub async fn verify_request(
+pub fn verify_request(
	garage: &Garage,
	mut req: Request<IncomingBody>,
	service: &'static str,
) -> Result<VerifiedRequest, Error> {
-	let checked_signature = payload::check_payload_signature(&garage, &mut req, service).await?;
+	let checked_signature = payload::check_payload_signature(&garage, &mut req, service)?;

	let request = streaming::parse_streaming_body(
		req,
@@ -32,7 +32,7 @@ pub struct CheckedSignature {
	pub signature_header: Option<String>,
}

-pub async fn check_payload_signature(
+pub fn check_payload_signature(
	garage: &Garage,
	request: &mut Request<IncomingBody>,
	service: &'static str,
@@ -43,9 +43,9 @@ pub async fn check_payload_signature(
		// We check for presigned-URL-style authentication first, because
		// the browser or something else could inject an Authorization header
		// that is totally unrelated to AWS signatures.
-		check_presigned_signature(garage, service, request, query).await
+		check_presigned_signature(garage, service, request, query)
	} else if request.headers().contains_key(AUTHORIZATION) {
-		check_standard_signature(garage, service, request, query).await
+		check_standard_signature(garage, service, request, query)
	} else {
		// Unsigned (anonymous) request
		let content_sha256 = request
@@ -93,7 +93,7 @@ fn parse_x_amz_content_sha256(header: Option<&str>) -> Result<ContentSha256Heade
	}
}

-async fn check_standard_signature(
+fn check_standard_signature(
	garage: &Garage,
	service: &'static str,
	request: &Request<IncomingBody>,
@@ -128,7 +128,7 @@ async fn check_standard_signature(
	trace!("canonical request:\n{}", canonical_request);
	trace!("string to sign:\n{}", string_to_sign);

-	let key = verify_v4(garage, service, &authorization, string_to_sign.as_bytes()).await?;
+	let key = verify_v4(garage, service, &authorization, string_to_sign.as_bytes())?;

	let content_sha256_header = parse_x_amz_content_sha256(Some(&authorization.content_sha256))?;

@@ -139,7 +139,7 @@ async fn check_standard_signature(
	})
}

-async fn check_presigned_signature(
+fn check_presigned_signature(
	garage: &Garage,
	service: &'static str,
	request: &mut Request<IncomingBody>,
@@ -178,7 +178,7 @@ async fn check_presigned_signature(
	trace!("canonical request (presigned url):\n{}", canonical_request);
	trace!("string to sign (presigned url):\n{}", string_to_sign);

-	let key = verify_v4(garage, service, &authorization, string_to_sign.as_bytes()).await?;
+	let key = verify_v4(garage, service, &authorization, string_to_sign.as_bytes())?;

	// In the page on presigned URLs, AWS specifies that if a signed query
	// parameter and a signed header of the same name have different values,
@@ -378,7 +378,7 @@ pub fn parse_date(date: &str) -> Result<DateTime<Utc>, Error> {
	Ok(Utc.from_utc_datetime(&date))
}

-pub async fn verify_v4(
+pub fn verify_v4(
	garage: &Garage,
	service: &str,
	auth: &Authorization,
@@ -391,8 +391,7 @@ pub async fn verify_v4(

	let key = garage
		.key_table
-		.get(&EmptyKey, &auth.key_id)
-		.await?
+		.get_local(&EmptyKey, &auth.key_id)?
		.filter(|k| !k.state.is_deleted())
		.ok_or_else(|| Error::forbidden(format!("No such key: {}", &auth.key_id)))?;
	let key_p = key.params().unwrap();
@@ -77,25 +77,19 @@ impl ApiHandler for K2VApiServer {
		// The OPTIONS method is processed early, before we even check for an API key
		if let Endpoint::Options = endpoint {
			let options_res = handle_options_api(garage, &req, Some(bucket_name))
-				.await
				.ok_or_bad_request("Error handling OPTIONS")?;
			return Ok(options_res.map(|_empty_body: EmptyBody| empty_body()));
		}

-		let verified_request = verify_request(&garage, req, "k2v").await?;
+		let verified_request = verify_request(&garage, req, "k2v")?;
		let req = verified_request.request;
		let api_key = verified_request.access_key;

-		let bucket_id = garage
-			.bucket_helper()
-			.resolve_bucket(&bucket_name, &api_key)
-			.await
-			.map_err(pass_helper_error)?;
		let bucket = garage
			.bucket_helper()
-			.get_existing_bucket(bucket_id)
-			.await
-			.map_err(helper_error_as_internal)?;
+			.resolve_bucket_fast(&bucket_name, &api_key)
+			.map_err(pass_helper_error)?;
+		let bucket_id = bucket.id;
		let bucket_params = bucket.state.into_option().unwrap();

		let allowed = match endpoint.authorization_type() {
@@ -2,8 +2,8 @@ use err_derive::Error;
use hyper::header::HeaderValue;
use hyper::{HeaderMap, StatusCode};

+pub(crate) use garage_api_common::common_error::pass_helper_error;
use garage_api_common::common_error::{commonErrorDerivative, CommonError};
-pub(crate) use garage_api_common::common_error::{helper_error_as_internal, pass_helper_error};
pub use garage_api_common::common_error::{
	CommonErrorDerivative, OkOrBadRequest, OkOrInternalError,
};
@@ -118,11 +118,11 @@ impl ApiHandler for S3ApiServer {
			return handle_post_object(garage, req, bucket_name.unwrap()).await;
		}
		if let Endpoint::Options = endpoint {
-			let options_res = handle_options_api(garage, &req, bucket_name).await?;
+			let options_res = handle_options_api(garage, &req, bucket_name)?;
			return Ok(options_res.map(|_empty_body: EmptyBody| empty_body()));
		}

-		let verified_request = verify_request(&garage, req, "s3").await?;
+		let verified_request = verify_request(&garage, req, "s3")?;
		let req = verified_request.request;
		let api_key = verified_request.access_key;

@@ -140,15 +140,11 @@ impl ApiHandler for S3ApiServer {
			return handle_create_bucket(&garage, req, &api_key.key_id, bucket_name).await;
		}

-		let bucket_id = garage
-			.bucket_helper()
-			.resolve_bucket(&bucket_name, &api_key)
-			.await
-			.map_err(pass_helper_error)?;
		let bucket = garage
			.bucket_helper()
-			.get_existing_bucket(bucket_id)
-			.await?;
+			.resolve_bucket_fast(&bucket_name, &api_key)
+			.map_err(pass_helper_error)?;
+		let bucket_id = bucket.id;
		let bucket_params = bucket.state.into_option().unwrap();

		let allowed = match endpoint.authorization_type() {
@@ -143,21 +143,16 @@ pub async fn handle_create_bucket(
	let api_key = helper.key().get_existing_key(api_key_id).await?;
	let key_params = api_key.params().unwrap();

-	let existing_bucket = if let Some(Some(bucket_id)) = key_params.local_aliases.get(&bucket_name)
-	{
-		Some(*bucket_id)
-	} else {
-		helper
-			.bucket()
-			.resolve_global_bucket_name(&bucket_name)
-			.await?
-	};
+	let existing_bucket = helper
+		.bucket()
+		.resolve_bucket(&bucket_name, &api_key.key_id)
+		.await?;

-	if let Some(bucket_id) = existing_bucket {
+	if let Some(bucket) = existing_bucket {
		// Check we have write or owner permission on the bucket,
		// in that case it's fine, return 200 OK, bucket exists;
		// otherwise return a forbidden error.
-		let kp = api_key.bucket_permissions(&bucket_id);
+		let kp = api_key.bucket_permissions(&bucket.id);
		if !(kp.allow_write || kp.allow_owner) {
			return Err(CommonError::BucketAlreadyExists.into());
		}
@@ -683,16 +683,15 @@ async fn get_copy_source(ctx: &ReqCtx, req: &Request<ReqBody>) -> Result<Object,
	let copy_source = percent_encoding::percent_decode_str(copy_source).decode_utf8()?;

	let (source_bucket, source_key) = parse_bucket_key(&copy_source, None)?;
-	let source_bucket_id = garage
+	let source_bucket = garage
		.bucket_helper()
-		.resolve_bucket(&source_bucket.to_string(), api_key)
-		.await
+		.resolve_bucket_fast(&source_bucket.to_string(), api_key)
		.map_err(pass_helper_error)?;

-	if !api_key.allow_read(&source_bucket_id) {
+	if !api_key.allow_read(&source_bucket.id) {
		return Err(Error::forbidden(format!(
-			"Reading from bucket {} not allowed for this key",
-			source_bucket
+			"Reading from bucket {:?} not allowed for this key",
+			source_bucket.id
		)));
	}

@@ -700,7 +699,7 @@ async fn get_copy_source(ctx: &ReqCtx, req: &Request<ReqBody>) -> Result<Object,

	let source_object = garage
		.object_table
-		.get(&source_bucket_id, &source_key.to_string())
+		.get(&source_bucket.id, &source_key.to_string())
		.await?
		.ok_or(Error::NoSuchKey)?;

@@ -104,22 +104,18 @@ pub async fn handle_post_object(
		key.to_owned()
	};

-	let api_key = verify_v4(&garage, "s3", &authorization, policy.as_bytes()).await?;
+	let api_key = verify_v4(&garage, "s3", &authorization, policy.as_bytes())?;

-	let bucket_id = garage
+	let bucket = garage
		.bucket_helper()
-		.resolve_bucket(&bucket_name, &api_key)
-		.await
+		.resolve_bucket_fast(&bucket_name, &api_key)
		.map_err(pass_helper_error)?;
+	let bucket_id = bucket.id;

	if !api_key.allow_write(&bucket_id) {
		return Err(Error::forbidden("Operation is not allowed for this key."));
	}

-	let bucket = garage
-		.bucket_helper()
-		.get_existing_bucket(bucket_id)
-		.await?;
	let bucket_params = bucket.state.into_option().unwrap();
	let matching_cors_rule = find_matching_cors_rule(
		&bucket_params,
@@ -18,7 +18,6 @@ garage_db.workspace = true
garage_net.workspace = true
garage_rpc.workspace = true
garage_util.workspace = true
-garage_table.workspace = true

opentelemetry.workspace = true

@@ -33,8 +33,6 @@ use garage_rpc::rpc_helper::OrderTag;
use garage_rpc::system::System;
use garage_rpc::*;

-use garage_table::replication::{TableReplication, TableShardedReplication};
-
use crate::block::*;
use crate::layout::*;
use crate::metrics::*;
@@ -74,8 +72,8 @@ impl Rpc for BlockRpc {

/// The block manager, handling block exchange between nodes, and block storage on local node
pub struct BlockManager {
-	/// Replication strategy, allowing to find on which node blocks should be located
-	pub replication: TableShardedReplication,
+	/// Quorum of nodes for write operations
+	pub write_quorum: usize,

	/// Data layout
	pub(crate) data_layout: ArcSwap<DataLayout>,
@@ -122,7 +120,7 @@ impl BlockManager {
	pub fn new(
		db: &db::Db,
		config: &Config,
-		replication: TableShardedReplication,
+		write_quorum: usize,
		system: Arc<System>,
	) -> Result<Arc<Self>, Error> {
		// Load or compute layout, i.e. assignment of data blocks to the different data directories
@@ -166,7 +164,7 @@ impl BlockManager {
		let scrub_persister = PersisterShared::new(&system.metadata_dir, "scrub_info");

		let block_manager = Arc::new(Self {
-			replication,
+			write_quorum,
			data_layout: ArcSwap::new(Arc::new(data_layout)),
			data_layout_persister,
			data_fsync: config.data_fsync,
@@ -338,6 +336,19 @@ impl BlockManager {
		Err(err)
	}

+	/// Returns the set of nodes that should store a copy of a given block.
+	/// These are the nodes assigned to the block's hash in the current
+	/// layout version only: since blocks are immutable, we don't need to
+	/// do complex logic when several layour versions are active at once,
+	/// just move them directly to the new nodes.
+	pub(crate) fn storage_nodes_of(&self, hash: &Hash) -> Vec<Uuid> {
+		self.system
+			.cluster_layout()
+			.current()
+			.nodes_of(hash)
+			.collect()
+	}
+
	// ---- Public interface ----

	/// Ask nodes that might have a block for it, return it as a stream
@@ -370,7 +381,7 @@ impl BlockManager {
		prevent_compression: bool,
		order_tag: Option<OrderTag>,
	) -> Result<(), Error> {
-		let who = self.system.cluster_layout().current_storage_nodes_of(&hash);
+		let who = self.storage_nodes_of(&hash);

		let compression_level = self.compression_level.filter(|_| !prevent_compression);
		let (header, bytes) = DataBlock::from_buffer(data, compression_level)
@@ -400,7 +411,7 @@ impl BlockManager {
			put_block_rpc,
			RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY)
				.with_drop_on_completion(permit)
-				.with_quorum(self.replication.write_quorum()),
+				.with_quorum(self.write_quorum),
		)
		.await?;

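Note: `BlockManager` no longer carries the whole `TableShardedReplication` strategy; the block write path above only needs the node set for the hash and the number of acknowledgements to wait for. The quorum arithmetic this relies on, as a self-contained sketch (the concrete quorum values are assumptions about the usual majority scheme; the real ones come from Garage's replication factor and consistency mode):

// A write acknowledged by w of n replicas and a read asking r replicas
// overlap in at least one node whenever r + w > n (read-after-write).
fn read_after_write(n: usize, w: usize, r: usize) -> bool {
	r + w > n
}

fn main() {
	let n = 3; // replication factor
	let w = n / 2 + 1; // majority write quorum = 2
	assert!(read_after_write(n, w, 2)); // quorum read: consistent
	assert!(!read_after_write(n, w, 1)); // single-node read: fast path only
}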
@@ -27,8 +27,6 @@ use garage_util::tranquilizer::Tranquilizer;
use garage_rpc::system::System;
use garage_rpc::*;

-use garage_table::replication::TableReplication;
-
use crate::manager::*;

// The delay between the time where a resync operation fails
@@ -377,11 +375,8 @@ impl BlockResyncManager {
			info!("Resync block {:?}: offloading and deleting", hash);
			let existing_path = existing_path.unwrap();

-			let mut who = manager
-				.system
-				.cluster_layout()
-				.current_storage_nodes_of(hash);
-			if who.len() < manager.replication.write_quorum() {
+			let mut who = manager.storage_nodes_of(hash);
+			if who.len() < manager.write_quorum {
				return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
			}
			who.retain(|id| *id != manager.system.id);
@@ -463,10 +458,7 @@ impl BlockResyncManager {

		// First, check whether we are still supposed to store that
		// block in the latest cluster layout version.
-		let storage_nodes = manager
-			.system
-			.cluster_layout()
-			.current_storage_nodes_of(&hash);
+		let storage_nodes = manager.storage_nodes_of(&hash);

		if !storage_nodes.contains(&manager.system.id) {
			info!(
@@ -154,13 +154,6 @@ impl Garage {
		info!("Initialize membership management system...");
		let system = System::new(network_key, replication_factor, consistency_mode, &config)?;

-		let data_rep_param = TableShardedReplication {
-			system: system.clone(),
-			replication_factor: replication_factor.into(),
-			write_quorum: replication_factor.write_quorum(consistency_mode),
-			read_quorum: 1,
-		};
-
		let meta_rep_param = TableShardedReplication {
			system: system.clone(),
			replication_factor: replication_factor.into(),
@@ -173,7 +166,8 @@ impl Garage {
		};

		info!("Initialize block manager...");
-		let block_manager = BlockManager::new(&db, &config, data_rep_param, system.clone())?;
+		let block_write_quorum = replication_factor.write_quorum(consistency_mode);
+		let block_manager = BlockManager::new(&db, &config, block_write_quorum, system.clone())?;
		block_manager.register_bg_vars(&mut bg_vars);

		// ---- admin tables ----
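Note: only the write quorum survives from the old `data_rep_param`; it is derived from the replication factor and the consistency mode. A hedged sketch of that mapping (an assumption for illustration, not a copy of `ReplicationFactor::write_quorum`; Garage's docs describe `degraded` as relaxing reads and `dangerous` as relaxing writes too):

// Assumed mapping, for illustration only.
enum ConsistencyMode {
	Consistent, // majority quorums for reads and writes
	Degraded,   // read quorum lowered to 1, writes keep a majority
	Dangerous,  // write quorum lowered to 1 as well
}

fn write_quorum(replication_factor: usize, mode: ConsistencyMode) -> usize {
	match mode {
		ConsistencyMode::Consistent | ConsistencyMode::Degraded => {
			replication_factor.div_euclid(2) + 1
		}
		ConsistencyMode::Dangerous => 1,
	}
}

fn main() {
	assert_eq!(write_quorum(3, ConsistencyMode::Consistent), 2);
	assert_eq!(write_quorum(3, ConsistencyMode::Dangerous), 1);
}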
@@ -1,7 +1,7 @@
use std::time::Duration;

use garage_util::data::*;
-use garage_util::error::OkOrMessage;
+use garage_util::error::{Error as GarageError, OkOrMessage};
use garage_util::time::*;

use garage_table::util::*;
@@ -16,61 +16,172 @@ pub struct BucketHelper<'a>(pub(crate) &'a Garage);

#[allow(clippy::ptr_arg)]
impl<'a> BucketHelper<'a> {
-	pub async fn resolve_global_bucket_name(
+	// ================
+	// Local functions to find buckets FAST.
+	// This is only for the fast path in API requests.
+	// They do not provide the read-after-write guarantee
+	// when used in conjunction with other operations that
+	// modify buckets and bucket aliases.
+	// ================
+
+	/// Return bucket corresponding to global bucket name, if it exists
+	/// (and is not a tombstone entry).
+	///
+	/// The name can be of two forms:
+	/// 1. A global bucket alias
+	/// 2. The full ID of a bucket encoded in hex
+	///
+	/// Note that there is no possible ambiguity between the two forms,
+	/// as the maximum length of a bucket name is 63 characters, and the full
+	/// hex id is 64 chars long.
+	///
+	/// This will not do any network interaction to check the alias and
+	/// bucket tables, it will only check the local copy of the table.
+	/// As a consequence, it does not provide read-after-write guarantees.
+	pub fn resolve_global_bucket_fast(
		&self,
		bucket_name: &String,
-	) -> Result<Option<Uuid>, Error> {
-		// Bucket names in Garage are aliases, true bucket identifiers
-		// are 32-byte UUIDs. This function resolves bucket names into
-		// their full identifier by looking up in the bucket_alias_table.
-		// This function also allows buckets to be identified by their
-		// full UUID (hex-encoded). Here, if the name to be resolved is a
-		// hex string of the correct length, it is directly parsed as a bucket
-		// identifier which is returned. There is no risk of this conflicting
-		// with an actual bucket name: bucket names are max 63 chars long by
-		// the AWS spec, and hex-encoded UUIDs are 64 chars long.
+	) -> Result<Option<Bucket>, GarageError> {
		let hexbucket = hex::decode(bucket_name.as_str())
			.ok()
			.and_then(|by| Uuid::try_from(&by));
-		if let Some(bucket_id) = hexbucket {
-			Ok(self
-				.0
-				.bucket_table
-				.get(&EmptyKey, &bucket_id)
-				.await?
-				.filter(|x| !x.state.is_deleted())
-				.map(|_| bucket_id))
-		} else {
-			Ok(self
-				.0
-				.bucket_alias_table
-				.get(&EmptyKey, bucket_name)
-				.await?
-				.and_then(|x| *x.state.get()))
-		}
+		let bucket_id = match hexbucket {
+			Some(id) => id,
+			None => {
+				let alias = self
+					.0
+					.bucket_alias_table
+					.get_local(&EmptyKey, bucket_name)?
+					.and_then(|x| *x.state.get());
+				match alias {
+					Some(id) => id,
+					None => return Ok(None),
+				}
+			}
+		};
+		Ok(self
+			.0
+			.bucket_table
+			.get_local(&EmptyKey, &bucket_id)?
+			.filter(|x| !x.state.is_deleted()))
	}

+	/// Return bucket corresponding to a bucket name from the perspective of
+	/// a given access key, if it exists (and is not a tombstone entry).
+	///
+	/// The name can be of three forms:
+	/// 1. A global bucket alias
+	/// 2. A local bucket alias
+	/// 3. The full ID of a bucket encoded in hex
+	///
+	/// This will not do any network interaction, it will only check the local
+	/// copy of the bucket and global alias table. It will also resolve local
+	/// aliases directly using the data provided in the `api_key` parameter.
+	/// As a consequence, it does not provide read-after-write guarantees.
+	///
+	/// In case no such bucket is found, this function returns a NoSuchBucket error.
	#[allow(clippy::ptr_arg)]
-	pub async fn resolve_bucket(&self, bucket_name: &String, api_key: &Key) -> Result<Uuid, Error> {
+	pub fn resolve_bucket_fast(
+		&self,
+		bucket_name: &String,
+		api_key: &Key,
+	) -> Result<Bucket, Error> {
		let api_key_params = api_key
			.state
			.as_option()
			.ok_or_message("Key should not be deleted at this point")?;

-		if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) {
-			Ok(*bucket_id)
-		} else {
+		let bucket_opt =
+			if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) {
+				self.0
+					.bucket_table
+					.get_local(&EmptyKey, &bucket_id)?
+					.filter(|x| !x.state.is_deleted())
+			} else {
+				self.resolve_global_bucket_fast(bucket_name)?
+			};
+		bucket_opt.ok_or_else(|| Error::NoSuchBucket(bucket_name.to_string()))
+	}
+
+	// ================
+	// Global functions that do quorum reads/writes,
+	// for admin operations.
+	// ================
+
+	/// This is the same as `resolve_global_bucket_fast`,
+	/// except that it does quorum reads to ensure consistency.
+	pub async fn resolve_global_bucket(
+		&self,
+		bucket_name: &String,
+	) -> Result<Option<Bucket>, GarageError> {
+		let hexbucket = hex::decode(bucket_name.as_str())
+			.ok()
+			.and_then(|by| Uuid::try_from(&by));
+		let bucket_id = match hexbucket {
+			Some(id) => id,
+			None => {
+				let alias = self
+					.0
+					.bucket_alias_table
+					.get(&EmptyKey, bucket_name)
+					.await?
+					.and_then(|x| *x.state.get());
+				match alias {
+					Some(id) => id,
+					None => return Ok(None),
+				}
+			}
+		};
+		Ok(self
+			.0
+			.bucket_table
+			.get(&EmptyKey, &bucket_id)
+			.await?
+			.filter(|x| !x.state.is_deleted()))
+	}
+
+	/// Return bucket corresponding to a bucket name from the perspective of
+	/// a given access key, if it exists (and is not a tombstone entry).
+	///
+	/// This is the same as `resolve_bucket_fast`, with the following differences:
+	///
+	/// - this function does quorum reads to ensure consistency.
+	/// - this function fetches the Key entry from the key table to ensure up-to-date data
+	/// - this function returns None if the bucket is not found, instead of HelperError::NoSuchBucket
+	#[allow(clippy::ptr_arg)]
+	pub async fn resolve_bucket(
+		&self,
+		bucket_name: &String,
+		key_id: &String,
+	) -> Result<Option<Bucket>, GarageError> {
+		let local_alias = self
+			.0
+			.key_table
+			.get(&EmptyKey, &key_id)
+			.await?
+			.and_then(|k| k.state.into_option())
+			.ok_or_else(|| GarageError::Message(format!("access key {} has been deleted", key_id)))?
+			.local_aliases
+			.get(bucket_name)
+			.copied()
+			.flatten();
+
+		if let Some(bucket_id) = local_alias {
			Ok(self
-				.resolve_global_bucket_name(bucket_name)
+				.0
+				.bucket_table
+				.get(&EmptyKey, &bucket_id)
				.await?
-				.ok_or_else(|| Error::NoSuchBucket(bucket_name.to_string()))?)
+				.filter(|x| !x.state.is_deleted()))
+		} else {
+			Ok(self.resolve_global_bucket(bucket_name).await?)
		}
	}

	/// Returns a Bucket if it is present in bucket table,
	/// even if it is in deleted state. Querying a non-existing
	/// bucket ID returns an internal error.
-	pub async fn get_internal_bucket(&self, bucket_id: Uuid) -> Result<Bucket, Error> {
+	pub(crate) async fn get_internal_bucket(&self, bucket_id: Uuid) -> Result<Bucket, Error> {
		Ok(self
			.0
			.bucket_table
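Note: the helper now exposes the two flavors side by side: the `*_fast` functions are synchronous and read only local table copies (the API request path), while the quorum variants are async and read-after-write safe (the admin path). Because a full hex bucket id is 64 characters and an alias is at most 63, the two name forms can be told apart without ambiguity; a self-contained toy of that resolution order (stand-in types, assuming the `hex` crate):

use std::collections::HashMap;

// Toy: a name is either a 64-char hex bucket id or an alias to look up.
fn resolve_global(name: &str, aliases: &HashMap<String, [u8; 32]>) -> Option<[u8; 32]> {
	if name.len() == 64 {
		if let Ok(bytes) = hex::decode(name) {
			let mut id = [0u8; 32];
			id.copy_from_slice(&bytes);
			return Some(id); // full bucket id given directly
		}
	}
	aliases.get(name).copied() // otherwise resolve through the alias table
}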
@@ -6,6 +6,7 @@ use garage_util::time::*;
use garage_table::util::*;

use crate::bucket_alias_table::*;
+use crate::bucket_table::*;
use crate::garage::Garage;
use crate::helper::bucket::BucketHelper;
use crate::helper::error::*;
@@ -56,7 +57,7 @@ impl<'a> LockedHelper<'a> {
		&self,
		bucket_id: Uuid,
		alias_name: &String,
-	) -> Result<(), Error> {
+	) -> Result<Bucket, Error> {
		if !is_valid_bucket_name(alias_name) {
			return Err(Error::InvalidBucketName(alias_name.to_string()));
		}
|
||||||
bucket_p.aliases = LwwMap::raw_item(alias_name.clone(), alias_ts, true);
|
bucket_p.aliases = LwwMap::raw_item(alias_name.clone(), alias_ts, true);
|
||||||
self.0.bucket_table.insert(&bucket).await?;
|
self.0.bucket_table.insert(&bucket).await?;
|
||||||
|
|
||||||
Ok(())
|
Ok(bucket)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unsets an alias for a bucket in global namespace.
|
/// Unsets an alias for a bucket in global namespace.
|
||||||
|
@ -112,7 +113,7 @@ impl<'a> LockedHelper<'a> {
|
||||||
&self,
|
&self,
|
||||||
bucket_id: Uuid,
|
bucket_id: Uuid,
|
||||||
alias_name: &String,
|
alias_name: &String,
|
||||||
) -> Result<(), Error> {
|
) -> Result<Bucket, Error> {
|
||||||
let mut bucket = self.bucket().get_existing_bucket(bucket_id).await?;
|
let mut bucket = self.bucket().get_existing_bucket(bucket_id).await?;
|
||||||
let bucket_state = bucket.state.as_option_mut().unwrap();
|
let bucket_state = bucket.state.as_option_mut().unwrap();
|
||||||
|
|
||||||
|
@@ -156,7 +157,7 @@ impl<'a> LockedHelper<'a> {
		bucket_state.aliases = LwwMap::raw_item(alias_name.clone(), alias_ts, false);
		self.0.bucket_table.insert(&bucket).await?;

-		Ok(())
+		Ok(bucket)
	}

	/// Ensures a bucket does not have a certain global alias.
@@ -215,7 +216,7 @@ impl<'a> LockedHelper<'a> {
		bucket_id: Uuid,
		key_id: &String,
		alias_name: &String,
-	) -> Result<(), Error> {
+	) -> Result<Bucket, Error> {
		let key_helper = KeyHelper(self.0);

		if !is_valid_bucket_name(alias_name) {
@@ -257,7 +258,7 @@ impl<'a> LockedHelper<'a> {
		bucket_p.local_aliases = LwwMap::raw_item(bucket_p_local_alias_key, alias_ts, true);
		self.0.bucket_table.insert(&bucket).await?;

-		Ok(())
+		Ok(bucket)
	}

	/// Unsets an alias for a bucket in the local namespace of a key.
@@ -271,7 +272,7 @@ impl<'a> LockedHelper<'a> {
		bucket_id: Uuid,
		key_id: &String,
		alias_name: &String,
-	) -> Result<(), Error> {
+	) -> Result<Bucket, Error> {
		let key_helper = KeyHelper(self.0);

		let mut bucket = self.bucket().get_existing_bucket(bucket_id).await?;
@@ -330,7 +331,7 @@ impl<'a> LockedHelper<'a> {
		bucket_p.local_aliases = LwwMap::raw_item(bucket_p_local_alias_key, alias_ts, false);
		self.0.bucket_table.insert(&bucket).await?;

-		Ok(())
+		Ok(bucket)
	}

	/// Sets permissions for a key on a bucket.
@@ -451,10 +451,7 @@ impl K2VRpcHandler {

		let mut value = self
			.item_table
-			.data
-			.read_entry(&key.partition, &key.sort_key)?
-			.map(|bytes| self.item_table.data.decode_entry(&bytes[..]))
-			.transpose()?
+			.get_local(&key.partition, &key.sort_key)?
			.unwrap_or_else(|| {
				K2VItem::new(
					key.partition.bucket_id,
@@ -149,14 +149,27 @@ impl LayoutHelper {
		self.layout.as_ref().unwrap()
	}

+	/// Returns the current layout version
	pub fn current(&self) -> &LayoutVersion {
		self.inner().current()
	}

+	/// Returns all layout versions currently active in the cluster
	pub fn versions(&self) -> &[LayoutVersion] {
		&self.inner().versions
	}

+	/// Returns the latest layout version for which it is safe to read data from,
+	/// i.e. the version whose version number is sync_map_min
+	pub fn read_version(&self) -> &LayoutVersion {
+		let sync_min = self.sync_map_min;
+		self.versions()
+			.iter()
+			.find(|x| x.version == sync_min)
+			.or(self.versions().last())
+			.unwrap()
+	}
+
	pub fn is_check_ok(&self) -> bool {
		self.is_check_ok
	}
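Note: `read_version` returns the layout version whose number equals `sync_map_min`, i.e. the newest version that every node has finished syncing, falling back to the latest version otherwise. The same selection rule as a self-contained toy:

struct Version {
	version: u64,
}

// Prefer the version matching sync_min; otherwise the newest one.
// `versions` is assumed non-empty, as in LayoutHelper.
fn read_version(versions: &[Version], sync_min: u64) -> &Version {
	versions
		.iter()
		.find(|x| x.version == sync_min)
		.or(versions.last())
		.unwrap()
}

fn main() {
	let vs = vec![Version { version: 4 }, Version { version: 5 }];
	assert_eq!(read_version(&vs, 4).version, 4); // v5 still syncing: read from v4
	assert_eq!(read_version(&vs, 9).version, 5); // no match: newest version
}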
@@ -181,6 +194,8 @@ impl LayoutHelper {
		self.sync_map_min
	}

+	// ---- helpers for layout synchronization ----
+
	pub fn sync_digest(&self) -> SyncLayoutDigest {
		SyncLayoutDigest {
			current: self.current().version,
@@ -189,50 +204,7 @@ impl LayoutHelper {
		}
	}

-	pub fn read_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
-		let sync_min = self.sync_map_min;
-		let version = self
-			.versions()
-			.iter()
-			.find(|x| x.version == sync_min)
-			.or(self.versions().last())
-			.unwrap();
-		version
-			.nodes_of(position, version.replication_factor)
-			.collect()
-	}
-
-	pub fn storage_sets_of(&self, position: &Hash) -> Vec<Vec<Uuid>> {
-		self.versions()
-			.iter()
-			.map(|x| x.nodes_of(position, x.replication_factor).collect())
-			.collect()
-	}
-
-	pub fn storage_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
-		let mut ret = vec![];
-		for version in self.versions().iter() {
-			ret.extend(version.nodes_of(position, version.replication_factor));
-		}
-		ret.sort();
-		ret.dedup();
-		ret
-	}
-
-	pub fn current_storage_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
-		let ver = self.current();
-		ver.nodes_of(position, ver.replication_factor).collect()
-	}
-
-	pub fn trackers_hash(&self) -> Hash {
-		self.trackers_hash
-	}
-
-	pub fn staging_hash(&self) -> Hash {
-		self.staging_hash
-	}
-
-	pub fn digest(&self) -> RpcLayoutDigest {
+	pub(crate) fn digest(&self) -> RpcLayoutDigest {
		RpcLayoutDigest {
			current_version: self.current().version,
			active_versions: self.versions().len(),
@@ -180,9 +180,7 @@ impl LayoutHistory {

			// Determine set of nodes for partition p in layout version v.
			// Sort the node set to avoid duplicate computations.
-			let mut set = v
-				.nodes_of(&p_hash, v.replication_factor)
-				.collect::<Vec<Uuid>>();
+			let mut set = v.nodes_of(&p_hash).collect::<Vec<Uuid>>();
			set.sort();

			// If this set was already processed, skip it.
@@ -143,16 +143,19 @@ impl LayoutManager {

	// ---- ACK LOCKING ----

-	pub fn write_sets_of(self: &Arc<Self>, position: &Hash) -> WriteLock<Vec<Vec<Uuid>>> {
+	pub fn write_lock_with<T, F>(self: &Arc<Self>, f: F) -> WriteLock<T>
+	where
+		F: FnOnce(&LayoutHelper) -> T,
+	{
		let layout = self.layout();
		let version = layout.current().version;
-		let nodes = layout.storage_sets_of(position);
+		let value = f(&layout);
		layout
			.ack_lock
			.get(&version)
			.unwrap()
			.fetch_add(1, Ordering::Relaxed);
-		WriteLock::new(version, self, nodes)
+		WriteLock::new(version, self, value)
	}

	// ---- INTERNALS ---
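Note: `write_sets_of` is generalized into `write_lock_with`, which computes an arbitrary value from the layout while holding the ack-lock, so each replication strategy can decide what to lock. A hedged sketch of how a caller could recover the old per-hash behavior (illustrative only; `layout_manager` and `position` are assumed in scope, and the closure body is not a quote of the real callers):

let write_lock = layout_manager.write_lock_with(|layout| {
	// Old `storage_sets_of(position)` logic: the node set of every
	// active layout version for this hash.
	layout
		.versions()
		.iter()
		.map(|v| v.nodes_of(position).collect::<Vec<_>>())
		.collect::<Vec<_>>()
});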
@@ -114,9 +114,7 @@ impl LayoutVersion {
	}

	/// Return the n servers in which data for this hash should be replicated
-	pub fn nodes_of(&self, position: &Hash, n: usize) -> impl Iterator<Item = Uuid> + '_ {
-		assert_eq!(n, self.replication_factor);
-
+	pub fn nodes_of(&self, position: &Hash) -> impl Iterator<Item = Uuid> + '_ {
		let data = &self.ring_assignment_data;

		let partition_nodes = if data.len() == self.replication_factor * (1 << PARTITION_BITS) {
@@ -573,7 +573,7 @@ impl RpcHelper {
		// Compute, for each layout version, the set of nodes that might store
		// the block, and put them in their preferred order as of `request_order`.
		let mut vernodes = layout.versions().iter().map(|ver| {
-			let nodes = ver.nodes_of(position, ver.replication_factor);
+			let nodes = ver.nodes_of(position);
			rpc_helper.request_order(layout.current(), nodes)
		});

|
@@ -607,7 +607,7 @@ impl RpcHelper
             // Second step: add nodes of older layout versions
             let old_ver_iter = layout.inner().old_versions.iter().rev();
             for ver in old_ver_iter {
-                let nodes = ver.nodes_of(position, ver.replication_factor);
+                let nodes = ver.nodes_of(position);
                 for node in rpc_helper.request_order(layout.current(), nodes) {
                     if !ret.contains(&node) {
                         ret.push(node);
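These two hunks only adapt call sites to the new `nodes_of` signature, but the surrounding logic is worth restating in isolation: candidates from newer layout versions come first, and nodes from older versions are appended only if they are not already listed. A toy recomputation, with integer node ids standing in for `Uuid`s and the per-version preference ordering assumed to be already applied:

// Build a deduplicated read order across layout versions, newest first.
fn block_read_order(version_nodes: &[Vec<u64>]) -> Vec<u64> {
    let mut ret: Vec<u64> = vec![];
    for nodes in version_nodes {
        for &node in nodes {
            // Keep first occurrence only: newer versions take precedence.
            if !ret.contains(&node) {
                ret.push(node);
            }
        }
    }
    ret
}

fn main() {
    // Current version stores on {4,5,6}; an older version stored on {1,2,4}.
    let order = block_read_order(&[vec![4, 5, 6], vec![1, 2, 4]]);
    assert_eq!(order, vec![4, 5, 6, 1, 2]);
}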
@@ -475,10 +475,7 @@ impl System
         let mut partitions_quorum = 0;
         let mut partitions_all_ok = 0;
         for (_, hash) in partitions.iter() {
-            let mut write_sets = layout
-                .versions()
-                .iter()
-                .map(|x| x.nodes_of(hash, x.replication_factor));
+            let mut write_sets = layout.versions().iter().map(|x| x.nodes_of(hash));
             let has_quorum = write_sets
                 .clone()
                 .all(|set| set.filter(|x| node_up(x)).count() >= quorum);
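The health check above declares a partition write-ready only if every active layout version has at least `quorum` of its replicas up. A self-contained restatement of that predicate, with stand-in types and integer node ids:

// A partition has quorum only if each version's node set does.
fn partition_has_quorum(
    write_sets: &[Vec<u64>], // one node set per layout version
    node_up: impl Fn(&u64) -> bool,
    quorum: usize,
) -> bool {
    write_sets
        .iter()
        .all(|set| set.iter().filter(|n| node_up(n)).count() >= quorum)
}

fn main() {
    let sets = vec![vec![1, 2, 3], vec![2, 3, 4]];
    let up = |n: &u64| *n != 4; // pretend node 4 is down
    // Both versions still have 2 nodes up...
    assert!(partition_has_quorum(&sets, up, 2));
    // ...but the second version cannot reach 3.
    assert!(!partition_has_quorum(&sets, up, 3));
}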
@@ -1,4 +1,5 @@
 use std::sync::Arc;
+use std::time::Duration;
 
 use garage_rpc::layout::*;
 use garage_rpc::system::System;
|
@ -24,29 +25,53 @@ pub struct TableFullReplication {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TableReplication for TableFullReplication {
|
impl TableReplication for TableFullReplication {
|
||||||
type WriteSets = Vec<Vec<Uuid>>;
|
type WriteSets = WriteLock<Vec<Vec<Uuid>>>;
|
||||||
|
|
||||||
|
// Do anti-entropy every 10 seconds.
|
||||||
|
// Compared to sharded tables, anti-entropy is much less costly as there is
|
||||||
|
// a single partition hash to exchange.
|
||||||
|
// Also, it's generally a much bigger problem for fullcopy tables to be out of sync.
|
||||||
|
const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
fn storage_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
|
fn storage_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
|
||||||
let layout = self.system.cluster_layout();
|
self.system.cluster_layout().all_nodes().to_vec()
|
||||||
layout.current().all_nodes().to_vec()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
|
fn read_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
|
||||||
vec![self.system.id]
|
self.system
|
||||||
|
.cluster_layout()
|
||||||
|
.read_version()
|
||||||
|
.all_nodes()
|
||||||
|
.to_vec()
|
||||||
}
|
}
|
||||||
fn read_quorum(&self) -> usize {
|
fn read_quorum(&self) -> usize {
|
||||||
1
|
let layout = self.system.cluster_layout();
|
||||||
|
let nodes = layout.read_version().all_nodes();
|
||||||
|
nodes.len().div_euclid(2) + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_sets(&self, hash: &Hash) -> Self::WriteSets {
|
fn write_sets(&self, _hash: &Hash) -> Self::WriteSets {
|
||||||
vec![self.storage_nodes(hash)]
|
self.system.layout_manager.write_lock_with(write_sets)
|
||||||
}
|
}
|
||||||
fn write_quorum(&self) -> usize {
|
fn write_quorum(&self) -> usize {
|
||||||
let nmembers = self.system.cluster_layout().current().all_nodes().len();
|
let layout = self.system.cluster_layout();
|
||||||
if nmembers < 3 {
|
let min_len = layout
|
||||||
1
|
.versions()
|
||||||
|
.iter()
|
||||||
|
.map(|x| x.all_nodes().len())
|
||||||
|
.min()
|
||||||
|
.unwrap();
|
||||||
|
let max_quorum = layout
|
||||||
|
.versions()
|
||||||
|
.iter()
|
||||||
|
.map(|x| x.all_nodes().len().div_euclid(2) + 1)
|
||||||
|
.max()
|
||||||
|
.unwrap();
|
||||||
|
if min_len < max_quorum {
|
||||||
|
warn!("Write quorum will not be respected for TableFullReplication operations due to multiple active layout versions with vastly different number of nodes");
|
||||||
|
min_len
|
||||||
} else {
|
} else {
|
||||||
nmembers.div_euclid(2) + 1
|
max_quorum
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
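The new `write_quorum` reconciles all active layout versions: it takes the largest per-version majority, unless some version is so small that this majority cannot fit inside it, in which case it degrades to the smallest set size and logs a warning. A toy recomputation of just that arithmetic:

// Quorum over several active layout versions, given only their sizes.
fn write_quorum(version_sizes: &[usize]) -> usize {
    let min_len = version_sizes.iter().copied().min().unwrap();
    let max_quorum = version_sizes
        .iter()
        .map(|n| n.div_euclid(2) + 1)
        .max()
        .unwrap();
    if min_len < max_quorum {
        // Degraded case: some version is too small to hold the majority
        // required by the largest version.
        min_len
    } else {
        max_quorum
    }
}

fn main() {
    // Single 5-node version: ordinary majority.
    assert_eq!(write_quorum(&[5]), 3);
    // 5-node and 4-node versions active at once: 3 works everywhere.
    assert_eq!(write_quorum(&[5, 4]), 3);
    // A 2-node version cannot host the 3-node majority of the 5-node one,
    // so the quorum degrades to 2 (with a warning in the real code).
    assert_eq!(write_quorum(&[5, 2]), 2);
}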
@@ -56,15 +81,26 @@ impl TableReplication for TableFullReplication
 
     fn sync_partitions(&self) -> SyncPartitions {
         let layout = self.system.cluster_layout();
-        let layout_version = layout.current().version;
+        let layout_version = layout.ack_map_min();
+
+        let partitions = vec![SyncPartition {
+            partition: 0u16,
+            first_hash: [0u8; 32].into(),
+            last_hash: [0xff; 32].into(),
+            storage_sets: write_sets(&layout),
+        }];
+
         SyncPartitions {
             layout_version,
-            partitions: vec![SyncPartition {
-                partition: 0u16,
-                first_hash: [0u8; 32].into(),
-                last_hash: [0xff; 32].into(),
-                storage_sets: vec![layout.current().all_nodes().to_vec()],
-            }],
+            partitions,
         }
     }
 }
+
+fn write_sets(layout: &LayoutHelper) -> Vec<Vec<Uuid>> {
+    layout
+        .versions()
+        .iter()
+        .map(|x| x.all_nodes().to_vec())
+        .collect()
+}
@@ -1,3 +1,5 @@
+use std::time::Duration;
+
 use garage_rpc::layout::*;
 use garage_util::data::*;
 
@@ -5,6 +7,8 @@ use garage_util::data::*;
 pub trait TableReplication: Send + Sync + 'static {
     type WriteSets: AsRef<Vec<Vec<Uuid>>> + AsMut<Vec<Vec<Uuid>>> + Send + Sync + 'static;
 
+    const ANTI_ENTROPY_INTERVAL: Duration;
+
     // See examples in table_sharded.rs and table_fullcopy.rs
     // To understand various replication methods
 
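Moving the interval onto the trait as an associated constant lets each replication strategy pick its own anti-entropy period while the sync worker stays generic. A minimal sketch of the pattern, with a simplified trait and stand-in strategy types:

use std::time::Duration;

trait TableReplication {
    const ANTI_ENTROPY_INTERVAL: Duration;
}

struct Sharded;
impl TableReplication for Sharded {
    const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
}

struct FullCopy;
impl TableReplication for FullCopy {
    const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10);
}

// The sync worker reads the interval from its type parameter; in the real
// code this becomes `Instant::now() + R::ANTI_ENTROPY_INTERVAL`.
fn next_full_sync<R: TableReplication>() -> Duration {
    R::ANTI_ENTROPY_INTERVAL
}

fn main() {
    assert_eq!(next_full_sync::<Sharded>(), Duration::from_secs(600));
    assert_eq!(next_full_sync::<FullCopy>(), Duration::from_secs(10));
}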
@@ -1,4 +1,5 @@
 use std::sync::Arc;
+use std::time::Duration;
 
 use garage_rpc::layout::*;
 use garage_rpc::system::System;
@@ -25,21 +26,37 @@ pub struct TableShardedReplication
 }
 
 impl TableReplication for TableShardedReplication {
+    // Do anti-entropy every 10 minutes
+    const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
+
     type WriteSets = WriteLock<Vec<Vec<Uuid>>>;
 
     fn storage_nodes(&self, hash: &Hash) -> Vec<Uuid> {
-        self.system.cluster_layout().storage_nodes_of(hash)
+        let layout = self.system.cluster_layout();
+        let mut ret = vec![];
+        for version in layout.versions().iter() {
+            ret.extend(version.nodes_of(hash));
+        }
+        ret.sort();
+        ret.dedup();
+        ret
     }
 
     fn read_nodes(&self, hash: &Hash) -> Vec<Uuid> {
-        self.system.cluster_layout().read_nodes_of(hash)
+        self.system
+            .cluster_layout()
+            .read_version()
+            .nodes_of(hash)
+            .collect()
     }
     fn read_quorum(&self) -> usize {
         self.read_quorum
     }
 
     fn write_sets(&self, hash: &Hash) -> Self::WriteSets {
-        self.system.layout_manager.write_sets_of(hash)
+        self.system
+            .layout_manager
+            .write_lock_with(|l| write_sets(l, hash))
     }
     fn write_quorum(&self) -> usize {
         self.write_quorum
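`storage_nodes` now returns the union of every active layout version's node set for the hash, so data is kept on every node that any version considers responsible during a migration. The dedup logic in isolation, with integer node ids as stand-ins:

// Union of per-version node sets, each node kept once.
fn storage_nodes(version_sets: &[Vec<u64>]) -> Vec<u64> {
    let mut ret = vec![];
    for set in version_sets {
        ret.extend(set.iter().copied());
    }
    // Sort + dedup so a node appearing in several versions is kept once.
    ret.sort();
    ret.dedup();
    ret
}

fn main() {
    // Old version stores the partition on {1,2,3}, new one on {3,4,5}:
    // during the transition the data must live on all five nodes.
    let union = storage_nodes(&[vec![1, 2, 3], vec![3, 4, 5]]);
    assert_eq!(union, vec![1, 2, 3, 4, 5]);
}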
@@ -57,12 +74,11 @@ impl TableReplication for TableShardedReplication
             .current()
             .partitions()
             .map(|(partition, first_hash)| {
-                let storage_sets = layout.storage_sets_of(&first_hash);
                 SyncPartition {
                     partition,
                     first_hash,
                     last_hash: [0u8; 32].into(), // filled in just after
-                    storage_sets,
+                    storage_sets: write_sets(&layout, &first_hash),
                 }
             })
             .collect::<Vec<_>>();
@@ -81,3 +97,11 @@ impl TableReplication for TableShardedReplication
         }
     }
 }
+
+fn write_sets(layout: &LayoutHelper, hash: &Hash) -> Vec<Vec<Uuid>> {
+    layout
+        .versions()
+        .iter()
+        .map(|x| x.nodes_of(hash).collect())
+        .collect()
+}
@@ -27,9 +27,6 @@ use crate::merkle::*;
 use crate::replication::*;
 use crate::*;
 
-// Do anti-entropy every 10 minutes
-const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
-
 pub struct TableSyncer<F: TableSchema, R: TableReplication> {
     system: Arc<System>,
     data: Arc<TableData<F, R>>,
@@ -514,7 +511,7 @@ impl<F: TableSchema, R: TableReplication> SyncWorker<F, R>
 
         partitions.partitions.shuffle(&mut thread_rng());
         self.todo = Some(partitions);
-        self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL;
+        self.next_full_sync = Instant::now() + R::ANTI_ENTROPY_INTERVAL;
     }
 }
@@ -482,6 +482,15 @@ impl<F: TableSchema, R: TableReplication> Table<F, R>
         Ok(ret_vec)
     }
 
+    pub fn get_local(
+        self: &Arc<Self>,
+        partition_key: &F::P,
+        sort_key: &F::S,
+    ) -> Result<Option<F::E>, Error> {
+        let bytes = self.data.read_entry(partition_key, sort_key)?;
+        bytes.map(|b| self.data.decode_entry(&b)).transpose()
+    }
+
     // =============== UTILITY FUNCTION FOR CLIENT OPERATIONS ===============
 
     async fn repair_on_read(&self, who: &[Uuid], what: F::E) -> Result<(), Error> {
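`get_local` reads and decodes an entry from this node's own storage, with no RPC and no read quorum, which is what makes it usable on paths that must not depend on cluster availability. A standalone sketch of its Option/Result `transpose` shape; the store, entry and error types below are stand-ins, not Garage's:

struct LocalStore;

impl LocalStore {
    fn read_entry(&self, key: &str) -> Result<Option<Vec<u8>>, String> {
        Ok(Some(key.as_bytes().to_vec())) // pretend the key is stored
    }

    fn decode_entry(&self, bytes: &[u8]) -> Result<String, String> {
        String::from_utf8(bytes.to_vec()).map_err(|e| e.to_string())
    }

    fn get_local(&self, key: &str) -> Result<Option<String>, String> {
        let bytes = self.read_entry(key)?;
        // Flip Option<Result<T, E>> into Result<Option<T>, E>:
        // a missing entry is Ok(None), a decode failure is Err(_).
        bytes.map(|b| self.decode_entry(&b)).transpose()
    }
}

fn main() {
    let store = LocalStore;
    assert_eq!(store.get_local("tok").unwrap(), Some("tok".to_string()));
}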