Garage v1.0 #683
1 changed files with 88 additions and 17 deletions
|
@ -129,6 +129,12 @@ impl RpcHelper {
|
||||||
N: IntoReq<M> + Send,
|
N: IntoReq<M> + Send,
|
||||||
H: StreamingEndpointHandler<M>,
|
H: StreamingEndpointHandler<M>,
|
||||||
{
|
{
|
||||||
|
let tracer = opentelemetry::global::tracer("garage");
|
||||||
|
let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to);
|
||||||
|
let mut span = tracer.start(span_name);
|
||||||
|
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
|
||||||
|
span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
|
||||||
|
|
||||||
let metric_tags = [
|
let metric_tags = [
|
||||||
KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
|
KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
|
||||||
KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
|
KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
|
||||||
|
@ -140,6 +146,7 @@ impl RpcHelper {
|
||||||
let node_id = to.into();
|
let node_id = to.into();
|
||||||
let rpc_call = endpoint
|
let rpc_call = endpoint
|
||||||
.call_streaming(&node_id, msg, strat.rs_priority)
|
.call_streaming(&node_id, msg, strat.rs_priority)
|
||||||
|
.with_context(Context::current_with_span(span))
|
||||||
.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
|
.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
|
||||||
|
|
||||||
let timeout = async {
|
let timeout = async {
|
||||||
|
@ -182,12 +189,17 @@ impl RpcHelper {
|
||||||
N: IntoReq<M>,
|
N: IntoReq<M>,
|
||||||
H: StreamingEndpointHandler<M>,
|
H: StreamingEndpointHandler<M>,
|
||||||
{
|
{
|
||||||
|
let tracer = opentelemetry::global::tracer("garage");
|
||||||
|
let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len());
|
||||||
|
let span = tracer.start(span_name);
|
||||||
|
|
||||||
let msg = msg.into_req().map_err(netapp::error::Error::from)?;
|
let msg = msg.into_req().map_err(netapp::error::Error::from)?;
|
||||||
|
|
||||||
let resps = join_all(
|
let resps = join_all(
|
||||||
to.iter()
|
to.iter()
|
||||||
.map(|to| self.call(endpoint, *to, msg.clone(), strat)),
|
.map(|to| self.call(endpoint, *to, msg.clone(), strat)),
|
||||||
)
|
)
|
||||||
|
.with_context(Context::current_with_span(span))
|
||||||
.await;
|
.await;
|
||||||
Ok(to
|
Ok(to
|
||||||
.iter()
|
.iter()
|
||||||
|
@ -219,6 +231,22 @@ impl RpcHelper {
|
||||||
|
|
||||||
/// Make a RPC call to multiple servers, returning either a Vec of responses,
|
/// Make a RPC call to multiple servers, returning either a Vec of responses,
|
||||||
/// or an error if quorum could not be reached due to too many errors
|
/// or an error if quorum could not be reached due to too many errors
|
||||||
|
///
|
||||||
|
/// If RequestStrategy has send_all_at_once set, then all requests will be
|
||||||
|
/// sent at once, and `try_call_many` will return as soon as a quorum of
|
||||||
|
/// responses is achieved, dropping and cancelling the remaining requests.
|
||||||
|
///
|
||||||
|
/// Otherwise, `quorum` requests will be sent at the same time, and if an
|
||||||
|
/// error response is received, a new request will be sent to replace it.
|
||||||
|
/// The ordering of nodes to which requests are sent is determined by
|
||||||
|
/// the `RpcHelper::request_order` function, which takes into account
|
||||||
|
/// parameters such as node zones and measured ping values.
|
||||||
|
///
|
||||||
|
/// In both cases, the basic contract of this function is that even in the
|
||||||
|
/// absence of failures, the RPC call might not be driven to completion
|
||||||
|
/// on all of the specified nodes. It is therefore unfit for broadcast
|
||||||
|
/// write operations where we expect all nodes to successfully store
|
||||||
|
/// the written date.
|
||||||
pub async fn try_call_many<M, N, H, S>(
|
pub async fn try_call_many<M, N, H, S>(
|
||||||
&self,
|
&self,
|
||||||
endpoint: &Arc<Endpoint<M, H>>,
|
endpoint: &Arc<Endpoint<M, H>>,
|
||||||
|
@ -235,7 +263,12 @@ impl RpcHelper {
|
||||||
let quorum = strategy.rs_quorum.unwrap_or(to.len());
|
let quorum = strategy.rs_quorum.unwrap_or(to.len());
|
||||||
|
|
||||||
let tracer = opentelemetry::global::tracer("garage");
|
let tracer = opentelemetry::global::tracer("garage");
|
||||||
let span_name = format!("Read RPC {} to {} of {}", endpoint.path(), quorum, to.len());
|
let span_name = format!(
|
||||||
|
"RPC [{}] try_call_many (quorum {}/{})",
|
||||||
|
endpoint.path(),
|
||||||
|
quorum,
|
||||||
|
to.len()
|
||||||
|
);
|
||||||
|
|
||||||
let mut span = tracer.start(span_name);
|
let mut span = tracer.start(span_name);
|
||||||
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
|
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
|
||||||
|
@ -266,6 +299,10 @@ impl RpcHelper {
|
||||||
// to reach a quorum, priorizing nodes with the lowest latency.
|
// to reach a quorum, priorizing nodes with the lowest latency.
|
||||||
// When there are errors, we start new requests to compensate.
|
// When there are errors, we start new requests to compensate.
|
||||||
|
|
||||||
|
// TODO: this could be made more aggressive, e.g. if after 2x the
|
||||||
|
// average ping of a given request, the response is not yet received,
|
||||||
|
// preemptively send an additional request to any remaining nodes.
|
||||||
|
|
||||||
// Reorder requests to priorize closeness / low latency
|
// Reorder requests to priorize closeness / low latency
|
||||||
let request_order = self.request_order(to.iter().copied());
|
let request_order = self.request_order(to.iter().copied());
|
||||||
let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false);
|
let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false);
|
||||||
|
@ -278,9 +315,7 @@ impl RpcHelper {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
let msg = msg.clone();
|
let msg = msg.clone();
|
||||||
let endpoint2 = endpoint.clone();
|
let endpoint2 = endpoint.clone();
|
||||||
(to, async move {
|
async move { self2.call(&endpoint2, to, msg, strategy).await }
|
||||||
self2.call(&endpoint2, to, msg, strategy).await
|
|
||||||
})
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// Vectors in which success results and errors will be collected
|
// Vectors in which success results and errors will be collected
|
||||||
|
@ -296,10 +331,8 @@ impl RpcHelper {
|
||||||
// If the current set of requests that are running is not enough to possibly
|
// If the current set of requests that are running is not enough to possibly
|
||||||
// reach quorum, start some new requests.
|
// reach quorum, start some new requests.
|
||||||
while send_all_at_once || successes.len() + resp_stream.len() < quorum {
|
while send_all_at_once || successes.len() + resp_stream.len() < quorum {
|
||||||
if let Some((req_to, fut)) = requests.next() {
|
if let Some(fut) = requests.next() {
|
||||||
let tracer = opentelemetry::global::tracer("garage");
|
resp_stream.push(fut)
|
||||||
let span = tracer.start(format!("RPC to {:?}", req_to));
|
|
||||||
resp_stream.push(fut.with_context(Context::current_with_span(span)));
|
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -379,6 +412,25 @@ impl RpcHelper {
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Make a RPC call to multiple servers, returning either a Vec of responses,
|
||||||
|
/// or an error if quorum could not be reached due to too many errors
|
||||||
|
///
|
||||||
|
/// Contrary to try_call_many, this fuction is especially made for broadcast
|
||||||
|
/// write operations. In particular:
|
||||||
|
///
|
||||||
|
/// - The request are sent to all specified nodes as soon as `try_write_many_sets`
|
||||||
|
/// is invoked.
|
||||||
|
///
|
||||||
|
/// - When `try_write_many_sets` returns, all remaining requests that haven't
|
||||||
|
/// completed move to a background task so that they have a chance to
|
||||||
|
/// complete successfully if there are no failures.
|
||||||
|
///
|
||||||
|
/// In addition, the nodes to which requests should be sent are divided in
|
||||||
|
/// "quorum sets", and `try_write_many_sets` only returns once a quorum
|
||||||
|
/// has been validated in each set. This is used in the case of cluster layout
|
||||||
|
/// changes, where data has to be written both in the old layout and in the
|
||||||
|
/// new one as long as all nodes have not successfully tranisitionned and
|
||||||
|
/// moved all data to the new layout.
|
||||||
pub async fn try_write_many_sets<M, N, H, S>(
|
pub async fn try_write_many_sets<M, N, H, S>(
|
||||||
&self,
|
&self,
|
||||||
endpoint: &Arc<Endpoint<M, H>>,
|
endpoint: &Arc<Endpoint<M, H>>,
|
||||||
|
@ -394,11 +446,11 @@ impl RpcHelper {
|
||||||
{
|
{
|
||||||
let quorum = strategy
|
let quorum = strategy
|
||||||
.rs_quorum
|
.rs_quorum
|
||||||
.expect("internal error: missing quroum in try_write_many_sets");
|
.expect("internal error: missing quorum value in try_write_many_sets");
|
||||||
|
|
||||||
let tracer = opentelemetry::global::tracer("garage");
|
let tracer = opentelemetry::global::tracer("garage");
|
||||||
let span_name = format!(
|
let span_name = format!(
|
||||||
"Write RPC {} (quorum {} in {} sets)",
|
"RPC [{}] try_write_many_sets (quorum {} in {} sets)",
|
||||||
endpoint.path(),
|
endpoint.path(),
|
||||||
quorum,
|
quorum,
|
||||||
to_sets.len()
|
to_sets.len()
|
||||||
|
@ -430,6 +482,8 @@ impl RpcHelper {
|
||||||
{
|
{
|
||||||
let msg = msg.into_req().map_err(netapp::error::Error::from)?;
|
let msg = msg.into_req().map_err(netapp::error::Error::from)?;
|
||||||
|
|
||||||
|
// Peers may appear in many quorum sets. Here, build a list of peers,
|
||||||
|
// mapping to the index of the quorum sets in which they appear.
|
||||||
let mut peers = HashMap::<Uuid, Vec<usize>>::new();
|
let mut peers = HashMap::<Uuid, Vec<usize>>::new();
|
||||||
for (i, set) in to_sets.iter().enumerate() {
|
for (i, set) in to_sets.iter().enumerate() {
|
||||||
for peer in set.iter() {
|
for peer in set.iter() {
|
||||||
|
@ -437,24 +491,30 @@ impl RpcHelper {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Send one request to each peer of the quorum sets
|
||||||
let requests = peers.iter().map(|(peer, _)| {
|
let requests = peers.iter().map(|(peer, _)| {
|
||||||
let self2 = self.clone();
|
let self2 = self.clone();
|
||||||
let msg = msg.clone();
|
let msg = msg.clone();
|
||||||
let endpoint2 = endpoint.clone();
|
let endpoint2 = endpoint.clone();
|
||||||
let to = *peer;
|
let to = *peer;
|
||||||
let tracer = opentelemetry::global::tracer("garage");
|
async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }
|
||||||
let span = tracer.start(format!("RPC to {:?}", to));
|
|
||||||
let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) };
|
|
||||||
fut.with_context(Context::current_with_span(span))
|
|
||||||
});
|
});
|
||||||
let mut resp_stream = requests.collect::<FuturesUnordered<_>>();
|
let mut resp_stream = requests.collect::<FuturesUnordered<_>>();
|
||||||
|
|
||||||
|
// Success and error responses will be collected in these two vectors
|
||||||
let mut successes = vec![];
|
let mut successes = vec![];
|
||||||
let mut errors = vec![];
|
let mut errors = vec![];
|
||||||
|
|
||||||
|
// `set_counters` is used to keep track of how many success and error
|
||||||
|
// responses are received within each quorum set. When a node returns
|
||||||
|
// its response, it counts as a sucess/an error for all of the quorum
|
||||||
|
// sets which it is part of.
|
||||||
let mut set_counters = vec![(0, 0); to_sets.len()];
|
let mut set_counters = vec![(0, 0); to_sets.len()];
|
||||||
|
|
||||||
|
// Drive requests to completion
|
||||||
while let Some((node, resp)) = resp_stream.next().await {
|
while let Some((node, resp)) = resp_stream.next().await {
|
||||||
|
// Store the response in the correct vector and increment the
|
||||||
|
// appropriate counters
|
||||||
match resp {
|
match resp {
|
||||||
Ok(msg) => {
|
Ok(msg) => {
|
||||||
for set in peers.get(&node).unwrap().iter() {
|
for set in peers.get(&node).unwrap().iter() {
|
||||||
|
@ -470,9 +530,8 @@ impl RpcHelper {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we have a quorum of ok in all quorum sets, then it's a success!
|
||||||
if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) {
|
if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) {
|
||||||
// Success
|
|
||||||
|
|
||||||
// Continue all other requets in background
|
// Continue all other requets in background
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await;
|
resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await;
|
||||||
|
@ -481,16 +540,28 @@ impl RpcHelper {
|
||||||
return Ok(successes);
|
return Ok(successes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If there is a quorum set for which too many errors were received,
|
||||||
|
// we know it's impossible to get a quorum, so return immediately.
|
||||||
if set_counters
|
if set_counters
|
||||||
.iter()
|
.iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len())
|
.any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len())
|
||||||
{
|
{
|
||||||
// Too many errors in this set, we know we won't get a quorum
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// At this point, there is no quorum and we know that a quorum
|
||||||
|
// will never be achieved. Currently, we drop all remaining requests.
|
||||||
|
// Should we still move them to background so that they can continue
|
||||||
|
// for non-failed nodes? Not doing so has no impact on correctness,
|
||||||
|
// but it means that more cancellation messages will be sent. Idk.
|
||||||
|
// (When an in-progress request future is dropped, Netapp automatically
|
||||||
|
// sends a cancellation message to the remote node to inform it that
|
||||||
|
// the result is no longer needed. In turn, if the remote node receives
|
||||||
|
// the cancellation message in time, it interrupts the task of the
|
||||||
|
// running request handler.)
|
||||||
|
|
||||||
// Failure, could not get quorum
|
// Failure, could not get quorum
|
||||||
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
|
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
|
||||||
Err(Error::Quorum(
|
Err(Error::Quorum(
|
||||||
|
|
Loading…
Reference in a new issue