garage/src/rpc/rpc_helper.rs

//! Contain structs related to making RPCs
use std::sync::Arc;
use std::time::Duration;

use futures::future::join_all;
use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt;
use futures_util::future::FutureExt;
use tokio::select;
use tokio::sync::{watch, Semaphore};

use opentelemetry::KeyValue;
use opentelemetry::{
	trace::{FutureExt as OtelFutureExt, Span, TraceContextExt, Tracer},
	Context,
};

pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler};
use netapp::message::IntoReq;
pub use netapp::message::{
	Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL,
	PRIO_SECONDARY,
};
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
pub use netapp::{self, NetApp, NodeID};

use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_util::error::Error;
use garage_util::metrics::RecordDuration;

use crate::metrics::RpcMetrics;
use crate::ring::Ring;

const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);

// Don't allow more than 100 concurrent outgoing RPCs.
const MAX_CONCURRENT_REQUESTS: usize = 100;

/// Strategy to apply when making RPC
#[derive(Copy, Clone)]
pub struct RequestStrategy {
	/// Max time to wait for reponse
	pub rs_timeout: Duration,
	/// Min number of response to consider the request successful
	pub rs_quorum: Option<usize>,
	/// Should requests be dropped after enough response are received
	pub rs_interrupt_after_quorum: bool,
	/// Request priority
	pub rs_priority: RequestPriority,
}

impl RequestStrategy {
	/// Create a RequestStrategy with default timeout and not interrupting when quorum reached
	pub fn with_priority(prio: RequestPriority) -> Self {
		RequestStrategy {
			rs_timeout: DEFAULT_TIMEOUT,
			rs_quorum: None,
			rs_interrupt_after_quorum: false,
			rs_priority: prio,
		}
	}
	/// Set quorum to be reached for request
	pub fn with_quorum(mut self, quorum: usize) -> Self {
		self.rs_quorum = Some(quorum);
		self
	}
	/// Set timeout of the strategy
	pub fn with_timeout(mut self, timeout: Duration) -> Self {
		self.rs_timeout = timeout;
		self
	}
	/// Set if requests can be dropped after quorum has been reached
	/// In general true for read requests, and false for write
	pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self {
		self.rs_interrupt_after_quorum = interrupt;
		self
	}
}

#[derive(Clone)]
pub struct RpcHelper(Arc<RpcHelperInner>);

struct RpcHelperInner {
	our_node_id: Uuid,
	fullmesh: Arc<FullMeshPeeringStrategy>,
	background: Arc<BackgroundRunner>,
	ring: watch::Receiver<Arc<Ring>>,
	request_buffer_semaphore: Arc<Semaphore>,
	metrics: RpcMetrics,
}

impl RpcHelper {
	pub(crate) fn new(
		our_node_id: Uuid,
		fullmesh: Arc<FullMeshPeeringStrategy>,
		background: Arc<BackgroundRunner>,
		ring: watch::Receiver<Arc<Ring>>,
	) -> Self {
		let sem = Arc::new(Semaphore::new(MAX_CONCURRENT_REQUESTS));

		let metrics = RpcMetrics::new(sem.clone());

		Self(Arc::new(RpcHelperInner {
			our_node_id,
			fullmesh,
			background,
			ring,
			request_buffer_semaphore: sem,
			metrics,
		}))
	}

	pub async fn call<M, N, H, S>(
		&self,
		endpoint: &Endpoint<M, H>,
		to: Uuid,
		msg: N,
		strat: RequestStrategy,
	) -> Result<S, Error>
	where
		M: Rpc<Response = Result<S, Error>>,
		N: IntoReq<M> + Send,
		H: StreamingEndpointHandler<M>,
	{
		let metric_tags = [
			KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
			KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
			KeyValue::new("to", format!("{:?}", to)),
		];

		let permit = self
			.0
			.request_buffer_semaphore
			.acquire()
			.record_duration(&self.0.metrics.rpc_queueing_time, &metric_tags)
			.await?;

		self.0.metrics.rpc_counter.add(1, &metric_tags);

		let node_id = to.into();
		let rpc_call = endpoint
			.call_streaming(&node_id, msg, strat.rs_priority)
			.record_duration(&self.0.metrics.rpc_duration, &metric_tags);

		select! {
			res = rpc_call => {
				drop(permit);

				if res.is_err() {
					self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags);
				}
				let res = res?.into_msg();

				if res.is_err() {
					self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags);
				}

				Ok(res?)
			}
			_ = tokio::time::sleep(strat.rs_timeout) => {
				drop(permit);
				self.0.metrics.rpc_timeout_counter.add(1, &metric_tags);
				Err(Error::Timeout)
			}
		}
	}

	pub async fn call_many<M, N, H, S>(
		&self,
		endpoint: &Endpoint<M, H>,
		to: &[Uuid],
		msg: N,
		strat: RequestStrategy,
	) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
	where
		M: Rpc<Response = Result<S, Error>>,
		N: IntoReq<M>,
		H: StreamingEndpointHandler<M>,
	{
		let msg = msg.into_req().map_err(netapp::error::Error::from)?;

		let resps = join_all(
			to.iter()
				.map(|to| self.call(endpoint, *to, msg.clone(), strat)),
		)
		.await;
		Ok(to
			.iter()
			.cloned()
			.zip(resps.into_iter())
			.collect::<Vec<_>>())
	}

	pub async fn broadcast<M, N, H, S>(
		&self,
		endpoint: &Endpoint<M, H>,
		msg: N,
		strat: RequestStrategy,
	) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
	where
		M: Rpc<Response = Result<S, Error>>,
		N: IntoReq<M>,
		H: StreamingEndpointHandler<M>,
	{
		let to = self
			.0
			.fullmesh
			.get_peer_list()
			.iter()
			.map(|p| p.id.into())
			.collect::<Vec<_>>();
		self.call_many(endpoint, &to[..], msg, strat).await
	}

	/// Make a RPC call to multiple servers, returning either a Vec of responses,
	/// or an error if quorum could not be reached due to too many errors
	pub async fn try_call_many<M, N, H, S>(
		&self,
		endpoint: &Arc<Endpoint<M, H>>,
		to: &[Uuid],
		msg: N,
		strategy: RequestStrategy,
	) -> Result<Vec<S>, Error>
	where
		M: Rpc<Response = Result<S, Error>> + 'static,
		N: IntoReq<M>,
		H: StreamingEndpointHandler<M> + 'static,
		S: Send + 'static,
	{
		let quorum = strategy.rs_quorum.unwrap_or(to.len());

		let tracer = opentelemetry::global::tracer("garage");
		let span_name = if strategy.rs_interrupt_after_quorum {
			format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len())
		} else {
			format!(
				"RPC {} to {} (quorum {})",
				endpoint.path(),
				to.len(),
				quorum
			)
		};
		let mut span = tracer.start(span_name);
		span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
		span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
		span.set_attribute(KeyValue::new("quorum", quorum as i64));
		span.set_attribute(KeyValue::new(
			"interrupt_after_quorum",
			strategy.rs_interrupt_after_quorum.to_string(),
		));

		self.try_call_many_internal(endpoint, to, msg, strategy, quorum)
			.with_context(Context::current_with_span(span))
			.await
	}

	async fn try_call_many_internal<M, N, H, S>(
		&self,
		endpoint: &Arc<Endpoint<M, H>>,
		to: &[Uuid],
		msg: N,
		strategy: RequestStrategy,
		quorum: usize,
	) -> Result<Vec<S>, Error>
	where
		M: Rpc<Response = Result<S, Error>> + 'static,
		N: IntoReq<M>,
		H: StreamingEndpointHandler<M> + 'static,
		S: Send + 'static,
	{
		let msg = msg.into_req().map_err(netapp::error::Error::from)?;

		// Build future for each request
		// They are not started now: they are added below in a FuturesUnordered
		// object that will take care of polling them (see below)
		let requests = to.iter().cloned().map(|to| {
			let self2 = self.clone();
			let msg = msg.clone();
			let endpoint2 = endpoint.clone();
			(to, async move {
				self2.call(&endpoint2, to, msg, strategy).await
			})
		});

		// Vectors in which success results and errors will be collected
		let mut successes = vec![];
		let mut errors = vec![];

		if strategy.rs_interrupt_after_quorum {
			// Case 1: once quorum is reached, other requests don't matter.
			// What we do here is only send the required number of requests
			// to reach a quorum, priorizing nodes with the lowest latency.
			// When there are errors, we start new requests to compensate.

			// Reorder requests to priorize closeness / low latency
			let request_order = self.request_order(to);
			let mut ord_requests = vec![(); request_order.len()]
				.into_iter()
				.map(|_| None)
				.collect::<Vec<_>>();
			for (to, fut) in requests {
				let i = request_order.iter().position(|x| *x == to).unwrap();
				ord_requests[i] = Some((to, fut));
			}

			// Make an iterator to take requests in their sorted order
			let mut requests = ord_requests.into_iter().map(Option::unwrap);

			// resp_stream will contain all of the requests that are currently in flight.
			// (for the moment none, they will be added in the loop below)
			let mut resp_stream = FuturesUnordered::new();

			// Do some requests and collect results
			'request_loop: while successes.len() < quorum {
				// If the current set of requests that are running is not enough to possibly
				// reach quorum, start some new requests.
				while successes.len() + resp_stream.len() < quorum {
					if let Some((req_to, fut)) = requests.next() {
						let tracer = opentelemetry::global::tracer("garage");
						let span = tracer.start(format!("RPC to {:?}", req_to));
						resp_stream.push(tokio::spawn(
							fut.with_context(Context::current_with_span(span)),
						));
					} else {
						// If we have no request to add, we know that we won't ever
						// reach quorum: bail out now.
						break 'request_loop;
					}
				}
				assert!(!resp_stream.is_empty()); // because of loop invariants

				// Wait for one request to terminate
				match resp_stream.next().await.unwrap().unwrap() {
					Ok(msg) => {
						successes.push(msg);
					}
					Err(e) => {
						errors.push(e);
					}
				}
			}
		} else {
			// Case 2: all of the requests need to be sent in all cases,
			// and need to terminate. (this is the case for writes that
			// must be spread to n nodes)
			// Just start all the requests in parallel and return as soon
			// as the quorum is reached.
			let mut resp_stream = requests
				.map(|(_, fut)| fut)
				.collect::<FuturesUnordered<_>>();

			while let Some(resp) = resp_stream.next().await {
				match resp {
					Ok(msg) => {
						successes.push(msg);
						if successes.len() >= quorum {
							break;
						}
					}
					Err(e) => {
						errors.push(e);
					}
				}
			}

			if !resp_stream.is_empty() {
				// Continue remaining requests in background.
				// Continue the remaining requests immediately using tokio::spawn
				// but enqueue a task in the background runner
				// to ensure that the process won't exit until the requests are done
				// (if we had just enqueued the resp_stream.collect directly in the background runner,
				// the requests might have been put on hold in the background runner's queue,
				// in which case they might timeout or otherwise fail)
				let wait_finished_fut = tokio::spawn(async move {
					resp_stream.collect::<Vec<Result<_, _>>>().await;
				});
				self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
			}
		}

		if successes.len() >= quorum {
			Ok(successes)
		} else {
			let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
			Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
		}
	}

	pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> {
		// Retrieve some status variables that we will use to sort requests
		let peer_list = self.0.fullmesh.get_peer_list();
		let ring: Arc<Ring> = self.0.ring.borrow().clone();
		let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
			Some(pc) => &pc.zone,
			None => "",
		};

		// Augment requests with some information used to sort them.
		// The tuples are as follows:
		//         (is another node?, is another zone?, latency, node ID, request future)
		// We store all of these tuples in a vec that we can sort.
		// By sorting this vec, we priorize ourself, then nodes in the same zone,
		// and within a same zone we priorize nodes with the lowest latency.
		let mut nodes = nodes
			.iter()
			.map(|to| {
				let peer_zone = match ring.layout.node_role(to) {
					Some(pc) => &pc.zone,
					None => "",
				};
				let peer_avg_ping = peer_list
					.iter()
					.find(|x| x.id.as_ref() == to.as_slice())
					.and_then(|pi| pi.avg_ping)
					.unwrap_or_else(|| Duration::from_secs(1));
				(
					*to != self.0.our_node_id,
					peer_zone != our_zone,
					peer_avg_ping,
					*to,
				)
			})
			.collect::<Vec<_>>();

		// Sort requests by (priorize ourself, priorize same zone, priorize low latency)
		nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping));

		nodes
			.into_iter()
			.map(|(_, _, _, to)| to)
			.collect::<Vec<_>>()
	}
}