2021-10-14 09:50:12 +00:00
|
|
|
//! Contain structs related to making RPCs
|
2023-11-14 14:40:46 +00:00
|
|
|
use std::collections::HashMap;
|
2023-11-09 13:12:05 +00:00
|
|
|
use std::sync::{Arc, RwLock};
|
2022-02-22 13:52:41 +00:00
|
|
|
use std::time::Duration;
|
2021-10-14 09:50:12 +00:00
|
|
|
|
|
|
|
use futures::future::join_all;
|
|
|
|
use futures::stream::futures_unordered::FuturesUnordered;
|
|
|
|
use futures::stream::StreamExt;
|
|
|
|
use tokio::select;
|
|
|
|
|
2022-02-17 22:28:23 +00:00
|
|
|
use opentelemetry::KeyValue;
|
|
|
|
use opentelemetry::{
|
|
|
|
trace::{FutureExt as OtelFutureExt, Span, TraceContextExt, Tracer},
|
|
|
|
Context,
|
|
|
|
};
|
|
|
|
|
2022-07-22 16:20:27 +00:00
|
|
|
pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler};
|
|
|
|
pub use netapp::message::{
|
2023-01-11 15:12:07 +00:00
|
|
|
IntoReq, Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH,
|
|
|
|
PRIO_NORMAL, PRIO_SECONDARY,
|
2022-07-22 16:20:27 +00:00
|
|
|
};
|
2021-10-14 09:50:12 +00:00
|
|
|
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
|
2022-07-22 16:20:27 +00:00
|
|
|
pub use netapp::{self, NetApp, NodeID};
|
2021-10-14 09:50:12 +00:00
|
|
|
|
2021-11-03 16:00:40 +00:00
|
|
|
use garage_util::data::*;
|
2021-10-19 14:16:10 +00:00
|
|
|
use garage_util::error::Error;
|
2022-02-22 12:53:59 +00:00
|
|
|
use garage_util::metrics::RecordDuration;
|
2021-10-14 09:50:12 +00:00
|
|
|
|
2023-11-15 13:20:50 +00:00
|
|
|
use crate::layout::LayoutHelper;
|
2022-02-16 13:23:04 +00:00
|
|
|
use crate::metrics::RpcMetrics;
|
2021-11-04 15:04:26 +00:00
|
|
|
|
2022-09-19 18:12:19 +00:00
|
|
|
/// Timeout applied to RPC calls when no timeout is configured: 5 minutes.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5 * 60);
|
2021-11-03 16:00:40 +00:00
|
|
|
|
2021-10-14 09:50:12 +00:00
|
|
|
/// Strategy to apply when making RPC
|
|
|
|
#[derive(Copy, Clone)]
|
|
|
|
pub struct RequestStrategy {
|
|
|
|
/// Min number of response to consider the request successful
|
2023-11-14 14:40:46 +00:00
|
|
|
rs_quorum: Option<usize>,
|
|
|
|
/// Send all requests at once
|
|
|
|
rs_send_all_at_once: Option<bool>,
|
2021-10-14 09:50:12 +00:00
|
|
|
/// Request priority
|
2023-11-14 14:40:46 +00:00
|
|
|
rs_priority: RequestPriority,
|
2022-09-20 14:01:41 +00:00
|
|
|
/// Custom timeout for this request
|
|
|
|
rs_timeout: Timeout,
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Timeout policy attached to a `RequestStrategy`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum Timeout {
    /// No timeout: the call is never interrupted by a timer
    None,
    /// Use the default timeout configured on the `RpcHelper`
    Default,
    /// Use the given duration as timeout for this request
    Custom(Duration),
}
|
|
|
|
|
|
|
|
impl RequestStrategy {
|
|
|
|
/// Create a RequestStrategy with default timeout and not interrupting when quorum reached
|
|
|
|
pub fn with_priority(prio: RequestPriority) -> Self {
|
|
|
|
RequestStrategy {
|
|
|
|
rs_quorum: None,
|
2023-11-14 14:40:46 +00:00
|
|
|
rs_send_all_at_once: None,
|
2021-10-14 09:50:12 +00:00
|
|
|
rs_priority: prio,
|
2022-09-20 14:01:41 +00:00
|
|
|
rs_timeout: Timeout::Default,
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
/// Set quorum to be reached for request
|
|
|
|
pub fn with_quorum(mut self, quorum: usize) -> Self {
|
|
|
|
self.rs_quorum = Some(quorum);
|
|
|
|
self
|
|
|
|
}
|
2023-11-14 14:40:46 +00:00
|
|
|
/// Set quorum to be reached for request
|
|
|
|
pub fn send_all_at_once(mut self, value: bool) -> Self {
|
|
|
|
self.rs_send_all_at_once = Some(value);
|
2021-10-14 09:50:12 +00:00
|
|
|
self
|
|
|
|
}
|
2022-09-19 18:12:19 +00:00
|
|
|
/// Deactivate timeout for this request
|
|
|
|
pub fn without_timeout(mut self) -> Self {
|
2022-09-20 14:01:41 +00:00
|
|
|
self.rs_timeout = Timeout::None;
|
|
|
|
self
|
|
|
|
}
|
|
|
|
/// Set custom timeout for this request
|
|
|
|
pub fn with_custom_timeout(mut self, timeout: Duration) -> Self {
|
|
|
|
self.rs_timeout = Timeout::Custom(timeout);
|
2022-09-19 18:12:19 +00:00
|
|
|
self
|
|
|
|
}
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Clone)]
|
2021-11-04 15:04:26 +00:00
|
|
|
pub struct RpcHelper(Arc<RpcHelperInner>);
|
|
|
|
|
|
|
|
struct RpcHelperInner {
|
|
|
|
our_node_id: Uuid,
|
|
|
|
fullmesh: Arc<FullMeshPeeringStrategy>,
|
2023-11-15 13:20:50 +00:00
|
|
|
layout: Arc<RwLock<LayoutHelper>>,
|
2022-02-16 13:23:04 +00:00
|
|
|
metrics: RpcMetrics,
|
2022-09-19 18:12:19 +00:00
|
|
|
rpc_timeout: Duration,
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
impl RpcHelper {
|
2021-11-03 16:00:40 +00:00
|
|
|
pub(crate) fn new(
|
2021-11-04 15:04:26 +00:00
|
|
|
our_node_id: Uuid,
|
2021-11-03 16:00:40 +00:00
|
|
|
fullmesh: Arc<FullMeshPeeringStrategy>,
|
2023-11-15 13:20:50 +00:00
|
|
|
layout: Arc<RwLock<LayoutHelper>>,
|
2022-09-19 18:12:19 +00:00
|
|
|
rpc_timeout: Option<Duration>,
|
2021-11-03 16:00:40 +00:00
|
|
|
) -> Self {
|
2022-09-19 18:12:19 +00:00
|
|
|
let metrics = RpcMetrics::new();
|
2022-02-16 13:23:04 +00:00
|
|
|
|
2021-11-04 15:04:26 +00:00
|
|
|
Self(Arc::new(RpcHelperInner {
|
|
|
|
our_node_id,
|
2021-11-03 16:00:40 +00:00
|
|
|
fullmesh,
|
2023-11-09 13:12:05 +00:00
|
|
|
layout,
|
2022-02-16 13:23:04 +00:00
|
|
|
metrics,
|
2022-09-19 18:12:19 +00:00
|
|
|
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
|
2021-11-04 15:04:26 +00:00
|
|
|
}))
|
2021-11-03 16:00:40 +00:00
|
|
|
}
|
|
|
|
|
2022-09-19 18:12:19 +00:00
|
|
|
pub fn rpc_timeout(&self) -> Duration {
|
|
|
|
self.0.rpc_timeout
|
|
|
|
}
|
|
|
|
|
2022-07-22 13:20:00 +00:00
|
|
|
pub async fn call<M, N, H, S>(
|
2021-10-14 09:50:12 +00:00
|
|
|
&self,
|
|
|
|
endpoint: &Endpoint<M, H>,
|
2021-10-15 09:05:09 +00:00
|
|
|
to: Uuid,
|
2022-07-22 13:20:00 +00:00
|
|
|
msg: N,
|
2021-10-14 09:50:12 +00:00
|
|
|
strat: RequestStrategy,
|
2021-10-15 09:05:09 +00:00
|
|
|
) -> Result<S, Error>
|
2021-10-14 09:50:12 +00:00
|
|
|
where
|
2021-10-15 09:05:09 +00:00
|
|
|
M: Rpc<Response = Result<S, Error>>,
|
2022-07-22 13:20:00 +00:00
|
|
|
N: IntoReq<M> + Send,
|
2022-07-22 16:20:27 +00:00
|
|
|
H: StreamingEndpointHandler<M>,
|
2021-10-14 09:50:12 +00:00
|
|
|
{
|
2023-11-28 10:12:39 +00:00
|
|
|
let tracer = opentelemetry::global::tracer("garage");
|
|
|
|
let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to);
|
|
|
|
let mut span = tracer.start(span_name);
|
|
|
|
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
|
|
|
|
span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
|
|
|
|
|
2022-02-24 13:59:49 +00:00
|
|
|
let metric_tags = [
|
|
|
|
KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
|
|
|
|
KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
|
|
|
|
KeyValue::new("to", format!("{:?}", to)),
|
|
|
|
];
|
2022-02-16 13:23:04 +00:00
|
|
|
|
|
|
|
self.0.metrics.rpc_counter.add(1, &metric_tags);
|
|
|
|
|
2021-10-15 09:05:09 +00:00
|
|
|
let node_id = to.into();
|
2022-02-22 13:52:41 +00:00
|
|
|
let rpc_call = endpoint
|
2022-07-22 13:20:00 +00:00
|
|
|
.call_streaming(&node_id, msg, strat.rs_priority)
|
2023-11-28 10:12:39 +00:00
|
|
|
.with_context(Context::current_with_span(span))
|
2022-02-22 12:53:59 +00:00
|
|
|
.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
|
2022-02-17 22:28:23 +00:00
|
|
|
|
2022-09-19 18:12:19 +00:00
|
|
|
let timeout = async {
|
2022-09-20 14:01:41 +00:00
|
|
|
match strat.rs_timeout {
|
|
|
|
Timeout::None => futures::future::pending().await,
|
|
|
|
Timeout::Default => tokio::time::sleep(self.0.rpc_timeout).await,
|
|
|
|
Timeout::Custom(t) => tokio::time::sleep(t).await,
|
2022-09-19 18:12:19 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2021-10-14 09:50:12 +00:00
|
|
|
select! {
|
2022-02-17 22:28:23 +00:00
|
|
|
res = rpc_call => {
|
2022-02-16 13:23:04 +00:00
|
|
|
if res.is_err() {
|
|
|
|
self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags);
|
|
|
|
}
|
2022-07-22 13:20:00 +00:00
|
|
|
let res = res?.into_msg();
|
2022-02-16 13:23:04 +00:00
|
|
|
|
|
|
|
if res.is_err() {
|
|
|
|
self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags);
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(res?)
|
2021-11-03 16:00:40 +00:00
|
|
|
}
|
2022-09-19 18:12:19 +00:00
|
|
|
() = timeout => {
|
2022-02-16 13:23:04 +00:00
|
|
|
self.0.metrics.rpc_timeout_counter.add(1, &metric_tags);
|
2021-11-03 16:00:40 +00:00
|
|
|
Err(Error::Timeout)
|
|
|
|
}
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-22 13:20:00 +00:00
|
|
|
pub async fn call_many<M, N, H, S>(
|
2021-10-14 09:50:12 +00:00
|
|
|
&self,
|
|
|
|
endpoint: &Endpoint<M, H>,
|
2021-10-15 09:05:09 +00:00
|
|
|
to: &[Uuid],
|
2022-07-22 13:20:00 +00:00
|
|
|
msg: N,
|
2021-10-14 09:50:12 +00:00
|
|
|
strat: RequestStrategy,
|
2022-07-22 13:20:00 +00:00
|
|
|
) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
|
2021-10-14 09:50:12 +00:00
|
|
|
where
|
2021-10-15 09:05:09 +00:00
|
|
|
M: Rpc<Response = Result<S, Error>>,
|
2022-07-22 13:20:00 +00:00
|
|
|
N: IntoReq<M>,
|
2022-07-22 16:20:27 +00:00
|
|
|
H: StreamingEndpointHandler<M>,
|
2021-10-14 09:50:12 +00:00
|
|
|
{
|
2023-11-28 10:12:39 +00:00
|
|
|
let tracer = opentelemetry::global::tracer("garage");
|
|
|
|
let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len());
|
|
|
|
let span = tracer.start(span_name);
|
|
|
|
|
2022-07-22 13:20:00 +00:00
|
|
|
let msg = msg.into_req().map_err(netapp::error::Error::from)?;
|
|
|
|
|
2021-10-14 09:50:12 +00:00
|
|
|
let resps = join_all(
|
|
|
|
to.iter()
|
2022-07-22 13:20:00 +00:00
|
|
|
.map(|to| self.call(endpoint, *to, msg.clone(), strat)),
|
2021-10-14 09:50:12 +00:00
|
|
|
)
|
2023-11-28 10:12:39 +00:00
|
|
|
.with_context(Context::current_with_span(span))
|
2021-10-14 09:50:12 +00:00
|
|
|
.await;
|
2022-07-22 13:20:00 +00:00
|
|
|
Ok(to
|
|
|
|
.iter()
|
2021-10-14 09:50:12 +00:00
|
|
|
.cloned()
|
|
|
|
.zip(resps.into_iter())
|
2022-07-22 13:20:00 +00:00
|
|
|
.collect::<Vec<_>>())
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
|
|
|
|
2022-07-22 13:20:00 +00:00
|
|
|
pub async fn broadcast<M, N, H, S>(
|
2021-10-14 09:50:12 +00:00
|
|
|
&self,
|
|
|
|
endpoint: &Endpoint<M, H>,
|
2022-07-22 13:20:00 +00:00
|
|
|
msg: N,
|
2021-10-14 09:50:12 +00:00
|
|
|
strat: RequestStrategy,
|
2022-07-22 13:20:00 +00:00
|
|
|
) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
|
2021-10-14 09:50:12 +00:00
|
|
|
where
|
2021-10-15 09:05:09 +00:00
|
|
|
M: Rpc<Response = Result<S, Error>>,
|
2022-07-22 13:20:00 +00:00
|
|
|
N: IntoReq<M>,
|
2022-07-22 16:20:27 +00:00
|
|
|
H: StreamingEndpointHandler<M>,
|
2021-10-14 09:50:12 +00:00
|
|
|
{
|
|
|
|
let to = self
|
2021-11-04 15:04:26 +00:00
|
|
|
.0
|
2021-10-14 09:50:12 +00:00
|
|
|
.fullmesh
|
|
|
|
.get_peer_list()
|
|
|
|
.iter()
|
2021-10-15 09:05:09 +00:00
|
|
|
.map(|p| p.id.into())
|
2021-10-14 09:50:12 +00:00
|
|
|
.collect::<Vec<_>>();
|
|
|
|
self.call_many(endpoint, &to[..], msg, strat).await
|
|
|
|
}
|
|
|
|
|
2021-11-04 15:04:26 +00:00
|
|
|
/// Make a RPC call to multiple servers, returning either a Vec of responses,
|
|
|
|
/// or an error if quorum could not be reached due to too many errors
|
2023-11-28 10:12:39 +00:00
|
|
|
///
|
|
|
|
/// If RequestStrategy has send_all_at_once set, then all requests will be
|
|
|
|
/// sent at once, and `try_call_many` will return as soon as a quorum of
|
|
|
|
/// responses is achieved, dropping and cancelling the remaining requests.
|
|
|
|
///
|
|
|
|
/// Otherwise, `quorum` requests will be sent at the same time, and if an
|
|
|
|
/// error response is received, a new request will be sent to replace it.
|
|
|
|
/// The ordering of nodes to which requests are sent is determined by
|
|
|
|
/// the `RpcHelper::request_order` function, which takes into account
|
|
|
|
/// parameters such as node zones and measured ping values.
|
|
|
|
///
|
|
|
|
/// In both cases, the basic contract of this function is that even in the
|
|
|
|
/// absence of failures, the RPC call might not be driven to completion
|
|
|
|
/// on all of the specified nodes. It is therefore unfit for broadcast
|
|
|
|
/// write operations where we expect all nodes to successfully store
|
|
|
|
/// the written date.
|
2022-07-22 16:20:27 +00:00
|
|
|
pub async fn try_call_many<M, N, H, S>(
|
2021-10-14 09:50:12 +00:00
|
|
|
&self,
|
|
|
|
endpoint: &Arc<Endpoint<M, H>>,
|
2021-10-15 09:05:09 +00:00
|
|
|
to: &[Uuid],
|
2022-07-22 16:20:27 +00:00
|
|
|
msg: N,
|
2021-10-14 09:50:12 +00:00
|
|
|
strategy: RequestStrategy,
|
2021-10-15 09:05:09 +00:00
|
|
|
) -> Result<Vec<S>, Error>
|
2021-10-14 09:50:12 +00:00
|
|
|
where
|
2021-10-15 09:05:09 +00:00
|
|
|
M: Rpc<Response = Result<S, Error>> + 'static,
|
2022-07-22 16:20:27 +00:00
|
|
|
N: IntoReq<M>,
|
|
|
|
H: StreamingEndpointHandler<M> + 'static,
|
2022-02-17 22:28:23 +00:00
|
|
|
S: Send + 'static,
|
2021-10-14 09:50:12 +00:00
|
|
|
{
|
2021-11-04 15:04:26 +00:00
|
|
|
let quorum = strategy.rs_quorum.unwrap_or(to.len());
|
2021-10-14 09:50:12 +00:00
|
|
|
|
2022-02-17 22:28:23 +00:00
|
|
|
let tracer = opentelemetry::global::tracer("garage");
|
2023-11-28 10:12:39 +00:00
|
|
|
let span_name = format!(
|
|
|
|
"RPC [{}] try_call_many (quorum {}/{})",
|
|
|
|
endpoint.path(),
|
|
|
|
quorum,
|
|
|
|
to.len()
|
|
|
|
);
|
2023-11-14 14:40:46 +00:00
|
|
|
|
2022-02-24 12:18:51 +00:00
|
|
|
let mut span = tracer.start(span_name);
|
2022-02-24 13:59:49 +00:00
|
|
|
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
|
2022-02-17 22:28:23 +00:00
|
|
|
span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
|
|
|
|
span.set_attribute(KeyValue::new("quorum", quorum as i64));
|
|
|
|
|
2023-11-14 14:40:46 +00:00
|
|
|
self.try_call_many_inner(endpoint, to, msg, strategy, quorum)
|
2022-02-22 14:25:13 +00:00
|
|
|
.with_context(Context::current_with_span(span))
|
|
|
|
.await
|
|
|
|
}
|
2021-11-04 15:04:26 +00:00
|
|
|
|
2023-11-14 14:40:46 +00:00
|
|
|
async fn try_call_many_inner<M, N, H, S>(
|
2022-02-22 14:25:13 +00:00
|
|
|
&self,
|
|
|
|
endpoint: &Arc<Endpoint<M, H>>,
|
|
|
|
to: &[Uuid],
|
2022-07-22 13:20:00 +00:00
|
|
|
msg: N,
|
2022-02-22 14:25:13 +00:00
|
|
|
strategy: RequestStrategy,
|
|
|
|
quorum: usize,
|
|
|
|
) -> Result<Vec<S>, Error>
|
|
|
|
where
|
|
|
|
M: Rpc<Response = Result<S, Error>> + 'static,
|
2022-07-22 13:20:00 +00:00
|
|
|
N: IntoReq<M>,
|
2022-07-22 16:20:27 +00:00
|
|
|
H: StreamingEndpointHandler<M> + 'static,
|
2022-02-22 14:25:13 +00:00
|
|
|
S: Send + 'static,
|
|
|
|
{
|
2023-11-14 14:40:46 +00:00
|
|
|
// Once quorum is reached, other requests don't matter.
|
|
|
|
// What we do here is only send the required number of requests
|
|
|
|
// to reach a quorum, priorizing nodes with the lowest latency.
|
|
|
|
// When there are errors, we start new requests to compensate.
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// TODO: this could be made more aggressive, e.g. if after 2x the
|
|
|
|
// average ping of a given request, the response is not yet received,
|
|
|
|
// preemptively send an additional request to any remaining nodes.
|
|
|
|
|
2023-11-14 14:40:46 +00:00
|
|
|
// Reorder requests to priorize closeness / low latency
|
2023-11-27 10:52:57 +00:00
|
|
|
let request_order = self.request_order(to.iter().copied());
|
2023-11-14 14:40:46 +00:00
|
|
|
let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false);
|
2022-02-22 14:25:13 +00:00
|
|
|
|
|
|
|
// Build future for each request
|
|
|
|
// They are not started now: they are added below in a FuturesUnordered
|
|
|
|
// object that will take care of polling them (see below)
|
2023-11-14 14:40:46 +00:00
|
|
|
let msg = msg.into_req().map_err(netapp::error::Error::from)?;
|
|
|
|
let mut requests = request_order.into_iter().map(|to| {
|
2022-02-22 14:25:13 +00:00
|
|
|
let self2 = self.clone();
|
|
|
|
let msg = msg.clone();
|
|
|
|
let endpoint2 = endpoint.clone();
|
2023-11-28 10:12:39 +00:00
|
|
|
async move { self2.call(&endpoint2, to, msg, strategy).await }
|
2022-02-22 14:25:13 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
// Vectors in which success results and errors will be collected
|
|
|
|
let mut successes = vec![];
|
|
|
|
let mut errors = vec![];
|
|
|
|
|
2023-11-14 14:40:46 +00:00
|
|
|
// resp_stream will contain all of the requests that are currently in flight.
|
|
|
|
// (for the moment none, they will be added in the loop below)
|
|
|
|
let mut resp_stream = FuturesUnordered::new();
|
|
|
|
|
|
|
|
// Do some requests and collect results
|
|
|
|
while successes.len() < quorum {
|
|
|
|
// If the current set of requests that are running is not enough to possibly
|
|
|
|
// reach quorum, start some new requests.
|
|
|
|
while send_all_at_once || successes.len() + resp_stream.len() < quorum {
|
2023-11-28 10:12:39 +00:00
|
|
|
if let Some(fut) = requests.next() {
|
|
|
|
resp_stream.push(fut)
|
2023-11-14 14:40:46 +00:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2022-07-25 16:19:35 +00:00
|
|
|
}
|
2022-02-22 14:25:13 +00:00
|
|
|
|
2023-11-14 14:40:46 +00:00
|
|
|
if successes.len() + resp_stream.len() < quorum {
|
|
|
|
// We know we won't ever reach quorum
|
|
|
|
break;
|
|
|
|
}
|
2022-02-17 22:28:23 +00:00
|
|
|
|
2023-11-14 14:40:46 +00:00
|
|
|
// Wait for one request to terminate
|
2023-11-16 15:34:01 +00:00
|
|
|
match resp_stream.next().await.unwrap() {
|
2023-11-14 14:40:46 +00:00
|
|
|
Ok(msg) => {
|
|
|
|
successes.push(msg);
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
2023-11-14 14:40:46 +00:00
|
|
|
Err(e) => {
|
|
|
|
errors.push(e);
|
2022-02-17 22:28:23 +00:00
|
|
|
}
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
|
|
|
}
|
2022-02-22 14:25:13 +00:00
|
|
|
|
|
|
|
if successes.len() >= quorum {
|
|
|
|
Ok(successes)
|
|
|
|
} else {
|
|
|
|
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
|
2023-11-15 12:07:42 +00:00
|
|
|
Err(Error::Quorum(
|
|
|
|
quorum,
|
|
|
|
None,
|
|
|
|
successes.len(),
|
|
|
|
to.len(),
|
|
|
|
errors,
|
|
|
|
))
|
2022-02-22 14:25:13 +00:00
|
|
|
}
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|
2022-07-25 16:19:35 +00:00
|
|
|
|
2023-11-27 10:52:57 +00:00
|
|
|
pub fn request_order(&self, nodes: impl Iterator<Item = Uuid>) -> Vec<Uuid> {
|
2022-07-25 16:19:35 +00:00
|
|
|
// Retrieve some status variables that we will use to sort requests
|
|
|
|
let peer_list = self.0.fullmesh.get_peer_list();
|
2023-11-09 13:12:05 +00:00
|
|
|
let layout = self.0.layout.read().unwrap();
|
2023-11-08 16:49:06 +00:00
|
|
|
let our_zone = match layout.current().node_role(&self.0.our_node_id) {
|
2022-07-25 16:19:35 +00:00
|
|
|
Some(pc) => &pc.zone,
|
|
|
|
None => "",
|
|
|
|
};
|
|
|
|
|
|
|
|
// Augment requests with some information used to sort them.
|
|
|
|
// The tuples are as follows:
|
|
|
|
// (is another node?, is another zone?, latency, node ID, request future)
|
|
|
|
// We store all of these tuples in a vec that we can sort.
|
|
|
|
// By sorting this vec, we priorize ourself, then nodes in the same zone,
|
|
|
|
// and within a same zone we priorize nodes with the lowest latency.
|
|
|
|
let mut nodes = nodes
|
|
|
|
.map(|to| {
|
2023-11-27 10:52:57 +00:00
|
|
|
let peer_zone = match layout.current().node_role(&to) {
|
2022-07-25 16:19:35 +00:00
|
|
|
Some(pc) => &pc.zone,
|
|
|
|
None => "",
|
|
|
|
};
|
|
|
|
let peer_avg_ping = peer_list
|
|
|
|
.iter()
|
|
|
|
.find(|x| x.id.as_ref() == to.as_slice())
|
|
|
|
.and_then(|pi| pi.avg_ping)
|
2022-09-20 14:01:41 +00:00
|
|
|
.unwrap_or_else(|| Duration::from_secs(10));
|
2022-07-25 16:19:35 +00:00
|
|
|
(
|
2023-11-27 10:52:57 +00:00
|
|
|
to != self.0.our_node_id,
|
2022-07-25 16:19:35 +00:00
|
|
|
peer_zone != our_zone,
|
|
|
|
peer_avg_ping,
|
2023-11-27 10:52:57 +00:00
|
|
|
to,
|
2022-07-25 16:19:35 +00:00
|
|
|
)
|
|
|
|
})
|
|
|
|
.collect::<Vec<_>>();
|
|
|
|
|
|
|
|
// Sort requests by (priorize ourself, priorize same zone, priorize low latency)
|
|
|
|
nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping));
|
|
|
|
|
|
|
|
nodes
|
|
|
|
.into_iter()
|
|
|
|
.map(|(_, _, _, to)| to)
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
}
|
2023-11-14 14:40:46 +00:00
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
/// Make a RPC call to multiple servers, returning either a Vec of responses,
|
|
|
|
/// or an error if quorum could not be reached due to too many errors
|
|
|
|
///
|
|
|
|
/// Contrary to try_call_many, this fuction is especially made for broadcast
|
|
|
|
/// write operations. In particular:
|
|
|
|
///
|
|
|
|
/// - The request are sent to all specified nodes as soon as `try_write_many_sets`
|
|
|
|
/// is invoked.
|
|
|
|
///
|
|
|
|
/// - When `try_write_many_sets` returns, all remaining requests that haven't
|
|
|
|
/// completed move to a background task so that they have a chance to
|
|
|
|
/// complete successfully if there are no failures.
|
|
|
|
///
|
|
|
|
/// In addition, the nodes to which requests should be sent are divided in
|
|
|
|
/// "quorum sets", and `try_write_many_sets` only returns once a quorum
|
|
|
|
/// has been validated in each set. This is used in the case of cluster layout
|
|
|
|
/// changes, where data has to be written both in the old layout and in the
|
|
|
|
/// new one as long as all nodes have not successfully tranisitionned and
|
|
|
|
/// moved all data to the new layout.
|
2023-11-14 14:40:46 +00:00
|
|
|
pub async fn try_write_many_sets<M, N, H, S>(
|
|
|
|
&self,
|
|
|
|
endpoint: &Arc<Endpoint<M, H>>,
|
|
|
|
to_sets: &[Vec<Uuid>],
|
|
|
|
msg: N,
|
|
|
|
strategy: RequestStrategy,
|
|
|
|
) -> Result<Vec<S>, Error>
|
|
|
|
where
|
|
|
|
M: Rpc<Response = Result<S, Error>> + 'static,
|
|
|
|
N: IntoReq<M>,
|
|
|
|
H: StreamingEndpointHandler<M> + 'static,
|
|
|
|
S: Send + 'static,
|
|
|
|
{
|
|
|
|
let quorum = strategy
|
|
|
|
.rs_quorum
|
2023-11-28 10:12:39 +00:00
|
|
|
.expect("internal error: missing quorum value in try_write_many_sets");
|
2023-11-14 14:40:46 +00:00
|
|
|
|
|
|
|
let tracer = opentelemetry::global::tracer("garage");
|
|
|
|
let span_name = format!(
|
2023-11-28 10:12:39 +00:00
|
|
|
"RPC [{}] try_write_many_sets (quorum {} in {} sets)",
|
2023-11-14 14:40:46 +00:00
|
|
|
endpoint.path(),
|
|
|
|
quorum,
|
|
|
|
to_sets.len()
|
|
|
|
);
|
|
|
|
|
|
|
|
let mut span = tracer.start(span_name);
|
|
|
|
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
|
|
|
|
span.set_attribute(KeyValue::new("to", format!("{:?}", to_sets)));
|
|
|
|
span.set_attribute(KeyValue::new("quorum", quorum as i64));
|
|
|
|
|
|
|
|
self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum)
|
|
|
|
.with_context(Context::current_with_span(span))
|
|
|
|
.await
|
|
|
|
}
|
|
|
|
|
|
|
|
async fn try_write_many_sets_inner<M, N, H, S>(
|
|
|
|
&self,
|
|
|
|
endpoint: &Arc<Endpoint<M, H>>,
|
|
|
|
to_sets: &[Vec<Uuid>],
|
|
|
|
msg: N,
|
|
|
|
strategy: RequestStrategy,
|
|
|
|
quorum: usize,
|
|
|
|
) -> Result<Vec<S>, Error>
|
|
|
|
where
|
|
|
|
M: Rpc<Response = Result<S, Error>> + 'static,
|
|
|
|
N: IntoReq<M>,
|
|
|
|
H: StreamingEndpointHandler<M> + 'static,
|
|
|
|
S: Send + 'static,
|
|
|
|
{
|
|
|
|
let msg = msg.into_req().map_err(netapp::error::Error::from)?;
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// Peers may appear in many quorum sets. Here, build a list of peers,
|
|
|
|
// mapping to the index of the quorum sets in which they appear.
|
2023-11-14 14:40:46 +00:00
|
|
|
let mut peers = HashMap::<Uuid, Vec<usize>>::new();
|
|
|
|
for (i, set) in to_sets.iter().enumerate() {
|
|
|
|
for peer in set.iter() {
|
|
|
|
peers.entry(*peer).or_default().push(i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// Send one request to each peer of the quorum sets
|
2023-11-14 14:40:46 +00:00
|
|
|
let requests = peers.iter().map(|(peer, _)| {
|
|
|
|
let self2 = self.clone();
|
|
|
|
let msg = msg.clone();
|
|
|
|
let endpoint2 = endpoint.clone();
|
|
|
|
let to = *peer;
|
2023-11-28 10:12:39 +00:00
|
|
|
async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }
|
2023-11-14 14:40:46 +00:00
|
|
|
});
|
|
|
|
let mut resp_stream = requests.collect::<FuturesUnordered<_>>();
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// Success and error responses will be collected in these two vectors
|
2023-11-14 14:40:46 +00:00
|
|
|
let mut successes = vec![];
|
|
|
|
let mut errors = vec![];
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// `set_counters` is used to keep track of how many success and error
|
|
|
|
// responses are received within each quorum set. When a node returns
|
|
|
|
// its response, it counts as a sucess/an error for all of the quorum
|
|
|
|
// sets which it is part of.
|
2023-11-14 14:40:46 +00:00
|
|
|
let mut set_counters = vec![(0, 0); to_sets.len()];
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// Drive requests to completion
|
2023-11-16 15:34:01 +00:00
|
|
|
while let Some((node, resp)) = resp_stream.next().await {
|
2023-11-28 10:12:39 +00:00
|
|
|
// Store the response in the correct vector and increment the
|
|
|
|
// appropriate counters
|
2023-11-14 14:40:46 +00:00
|
|
|
match resp {
|
|
|
|
Ok(msg) => {
|
|
|
|
for set in peers.get(&node).unwrap().iter() {
|
|
|
|
set_counters[*set].0 += 1;
|
|
|
|
}
|
|
|
|
successes.push(msg);
|
|
|
|
}
|
|
|
|
Err(e) => {
|
|
|
|
for set in peers.get(&node).unwrap().iter() {
|
|
|
|
set_counters[*set].1 += 1;
|
|
|
|
}
|
|
|
|
errors.push(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// If we have a quorum of ok in all quorum sets, then it's a success!
|
2023-11-16 15:34:01 +00:00
|
|
|
if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) {
|
2023-11-14 14:40:46 +00:00
|
|
|
// Continue all other requets in background
|
|
|
|
tokio::spawn(async move {
|
2023-11-16 15:34:01 +00:00
|
|
|
resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await;
|
2023-11-14 14:40:46 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
return Ok(successes);
|
|
|
|
}
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// If there is a quorum set for which too many errors were received,
|
|
|
|
// we know it's impossible to get a quorum, so return immediately.
|
2023-11-14 14:40:46 +00:00
|
|
|
if set_counters
|
|
|
|
.iter()
|
|
|
|
.enumerate()
|
2023-11-16 15:34:01 +00:00
|
|
|
.any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len())
|
2023-11-14 14:40:46 +00:00
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-28 10:12:39 +00:00
|
|
|
// At this point, there is no quorum and we know that a quorum
|
|
|
|
// will never be achieved. Currently, we drop all remaining requests.
|
|
|
|
// Should we still move them to background so that they can continue
|
|
|
|
// for non-failed nodes? Not doing so has no impact on correctness,
|
|
|
|
// but it means that more cancellation messages will be sent. Idk.
|
|
|
|
// (When an in-progress request future is dropped, Netapp automatically
|
|
|
|
// sends a cancellation message to the remote node to inform it that
|
|
|
|
// the result is no longer needed. In turn, if the remote node receives
|
|
|
|
// the cancellation message in time, it interrupts the task of the
|
|
|
|
// running request handler.)
|
|
|
|
|
2023-11-14 14:40:46 +00:00
|
|
|
// Failure, could not get quorum
|
|
|
|
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
|
2023-11-15 12:07:42 +00:00
|
|
|
Err(Error::Quorum(
|
|
|
|
quorum,
|
|
|
|
Some(to_sets.len()),
|
|
|
|
successes.len(),
|
|
|
|
peers.len(),
|
|
|
|
errors,
|
|
|
|
))
|
2023-11-14 14:40:46 +00:00
|
|
|
}
|
2021-10-14 09:50:12 +00:00
|
|
|
}
|