Request strategy: don't launch all 3 requests if not needed

This commit is contained in:
Alex 2021-11-04 16:04:26 +01:00
parent 2090a6187f
commit e8811f7c9d
No known key found for this signature in database
GPG key ID: EDABF9711E244EB1
2 changed files with 156 additions and 49 deletions

View file

@ -7,7 +7,7 @@ use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt; use futures::stream::StreamExt;
use futures_util::future::FutureExt; use futures_util::future::FutureExt;
use tokio::select; use tokio::select;
use tokio::sync::Semaphore; use tokio::sync::{watch, Semaphore};
pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc}; pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc};
use netapp::peering::fullmesh::FullMeshPeeringStrategy; use netapp::peering::fullmesh::FullMeshPeeringStrategy;
@ -18,6 +18,8 @@ use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use crate::ring::Ring;
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
// Try to never have more than 200MB of outgoing requests // Try to never have more than 200MB of outgoing requests
@ -67,22 +69,30 @@ impl RequestStrategy {
} }
#[derive(Clone)] #[derive(Clone)]
pub struct RpcHelper { pub struct RpcHelper(Arc<RpcHelperInner>);
pub(crate) fullmesh: Arc<FullMeshPeeringStrategy>,
pub(crate) background: Arc<BackgroundRunner>, struct RpcHelperInner {
request_buffer_semaphore: Arc<Semaphore>, our_node_id: Uuid,
fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>,
request_buffer_semaphore: Semaphore,
} }
impl RpcHelper { impl RpcHelper {
pub(crate) fn new( pub(crate) fn new(
our_node_id: Uuid,
fullmesh: Arc<FullMeshPeeringStrategy>, fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>, background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>,
) -> Self { ) -> Self {
Self { Self(Arc::new(RpcHelperInner {
our_node_id,
fullmesh, fullmesh,
background, background,
request_buffer_semaphore: Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE)), ring,
} request_buffer_semaphore: Semaphore::new(REQUEST_BUFFER_SIZE),
}))
} }
pub async fn call<M, H, S>( pub async fn call<M, H, S>(
@ -111,7 +121,11 @@ impl RpcHelper {
H: EndpointHandler<M>, H: EndpointHandler<M>,
{ {
let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32; let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32;
let permit = self.request_buffer_semaphore.acquire_many(msg_size).await?; let permit = self
.0
.request_buffer_semaphore
.acquire_many(msg_size)
.await?;
let node_id = to.into(); let node_id = to.into();
select! { select! {
@ -160,6 +174,7 @@ impl RpcHelper {
H: EndpointHandler<M>, H: EndpointHandler<M>,
{ {
let to = self let to = self
.0
.fullmesh .fullmesh
.get_peer_list() .get_peer_list()
.iter() .iter()
@ -168,8 +183,8 @@ impl RpcHelper {
self.call_many(endpoint, &to[..], msg, strat).await self.call_many(endpoint, &to[..], msg, strat).await
} }
/// Make a RPC call to multiple servers, returning either a Vec of responses, or an error if /// Make a RPC call to multiple servers, returning either a Vec of responses,
/// strategy could not be respected due to too many errors /// or an error if quorum could not be reached due to too many errors
pub async fn try_call_many<M, H, S>( pub async fn try_call_many<M, H, S>(
&self, &self,
endpoint: &Arc<Endpoint<M, H>>, endpoint: &Arc<Endpoint<M, H>>,
@ -183,26 +198,117 @@ impl RpcHelper {
S: Send, S: Send,
{ {
let msg = Arc::new(msg); let msg = Arc::new(msg);
let mut resp_stream = to
.to_vec() // Build future for each request
.into_iter() // They are not started now: they are added below in a FuturesUnordered
.map(|to| { // object that will take care of polling them (see below)
let requests = to.iter().cloned().map(|to| {
let self2 = self.clone(); let self2 = self.clone();
let msg = msg.clone(); let msg = msg.clone();
let endpoint2 = endpoint.clone(); let endpoint2 = endpoint.clone();
async move { self2.call_arc(&endpoint2, to, msg, strategy).await } (to, async move {
self2.call_arc(&endpoint2, to, msg, strategy).await
}) })
.collect::<FuturesUnordered<_>>(); });
let mut results = vec![];
let mut errors = vec![];
let quorum = strategy.rs_quorum.unwrap_or(to.len()); let quorum = strategy.rs_quorum.unwrap_or(to.len());
// Vectors in which success results and errors will be collected
let mut successes = vec![];
let mut errors = vec![];
if strategy.rs_interrupt_after_quorum {
// Case 1: once quorum is reached, other requests don't matter.
// What we do here is only send the required number of requests
// to reach a quorum, priorizing nodes with the lowest latency.
// When there are errors, we start new requests to compensate.
// Retrieve some status variables that we will use to sort requests
let peer_list = self.0.fullmesh.get_peer_list();
let ring: Arc<Ring> = self.0.ring.borrow().clone();
let our_zone = match ring.config.members.get(&self.0.our_node_id) {
Some(pc) => &pc.zone,
None => "",
};
// Augment requests with some information used to sort them.
// The tuples are as follows:
// (is another node?, is another zone?, latency, node ID, request future)
// We store all of these tuples in a vec that we can sort.
// By sorting this vec, we priorize ourself, then nodes in the same zone,
// and within a same zone we priorize nodes with the lowest latency.
let mut requests = requests
.map(|(to, fut)| {
let peer_zone = match ring.config.members.get(&to) {
Some(pc) => &pc.zone,
None => "",
};
let peer_avg_ping = peer_list
.iter()
.find(|x| x.id.as_ref() == to.as_slice())
.map(|pi| pi.avg_ping)
.flatten()
.unwrap_or_else(|| Duration::from_secs(1));
(
to != self.0.our_node_id,
peer_zone != our_zone,
peer_avg_ping,
to,
fut,
)
})
.collect::<Vec<_>>();
// Sort requests by (priorize ourself, priorize same zone, priorize low latency)
requests
.sort_by_key(|(diffnode, diffzone, ping, _to, _fut)| (*diffnode, *diffzone, *ping));
// Make an iterator to take requests in their sorted order
let mut requests = requests.into_iter();
// resp_stream will contain all of the requests that are currently in flight.
// (for the moment none, they will be added in the loop below)
let mut resp_stream = FuturesUnordered::new();
// Do some requests and collect results
'request_loop: while successes.len() < quorum {
// If the current set of requests that are running is not enough to possibly
// reach quorum, start some new requests.
while successes.len() + resp_stream.len() < quorum {
if let Some((_, _, _, _to, fut)) = requests.next() {
resp_stream.push(fut);
} else {
// If we have no request to add, we know that we won't ever
// reach quorum: bail out now.
break 'request_loop;
}
}
assert!(!resp_stream.is_empty()); // because of loop invariants
// Wait for one request to terminate
match resp_stream.next().await.unwrap() {
Ok(msg) => {
successes.push(msg);
}
Err(e) => {
errors.push(e);
}
}
}
} else {
// Case 2: all of the requests need to be sent in all cases,
// and need to terminate. (this is the case for writes that
// must be spread to n nodes)
// Just start all the requests in parallel and return as soon
// as the quorum is reached.
let mut resp_stream = requests
.map(|(_, fut)| fut)
.collect::<FuturesUnordered<_>>();
while let Some(resp) = resp_stream.next().await { while let Some(resp) = resp_stream.next().await {
match resp { match resp {
Ok(msg) => { Ok(msg) => {
results.push(msg); successes.push(msg);
if results.len() >= quorum { if successes.len() >= quorum {
break; break;
} }
} }
@ -212,25 +318,26 @@ impl RpcHelper {
} }
} }
if results.len() >= quorum { if !resp_stream.is_empty() {
// Continue requests in background. // Continue remaining requests in background.
// Continue the remaining requests immediately using tokio::spawn // Continue the remaining requests immediately using tokio::spawn
// but enqueue a task in the background runner // but enqueue a task in the background runner
// to ensure that the process won't exit until the requests are done // to ensure that the process won't exit until the requests are done
// (if we had just enqueued the resp_stream.collect directly in the background runner, // (if we had just enqueued the resp_stream.collect directly in the background runner,
// the requests might have been put on hold in the background runner's queue, // the requests might have been put on hold in the background runner's queue,
// in which case they might timeout or otherwise fail) // in which case they might timeout or otherwise fail)
if !strategy.rs_interrupt_after_quorum {
let wait_finished_fut = tokio::spawn(async move { let wait_finished_fut = tokio::spawn(async move {
resp_stream.collect::<Vec<_>>().await; resp_stream.collect::<Vec<Result<_, _>>>().await;
}); });
self.background.spawn(wait_finished_fut.map(|_| Ok(()))); self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
}
} }
Ok(results) if successes.len() >= quorum {
Ok(successes)
} else { } else {
let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
Err(Error::Quorum(quorum, results.len(), to.len(), errors)) Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
} }
} }
} }

View file

@ -235,7 +235,7 @@ impl System {
node_status: RwLock::new(HashMap::new()), node_status: RwLock::new(HashMap::new()),
netapp: netapp.clone(), netapp: netapp.clone(),
fullmesh: fullmesh.clone(), fullmesh: fullmesh.clone(),
rpc: RpcHelper::new(fullmesh, background.clone()), rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()),
system_endpoint, system_endpoint,
replication_factor, replication_factor,
rpc_listen_addr: config.rpc_bind_addr, rpc_listen_addr: config.rpc_bind_addr,