Fix unbounded buffering when one node has slower network #792

Merged
lx merged 2 commits from fix-buffering into main 2024-03-28 12:40:28 +00:00
5 changed files with 80 additions and 45 deletions
Showing only changes of commit 85f580cbde - Show all commits

View file

@ -238,9 +238,15 @@ impl BlockManager {
async fn rpc_get_raw_block_streaming(
&self,
hash: &Hash,
priority: RequestPriority,
order_tag: Option<OrderTag>,
) -> Result<DataBlockStream, Error> {
self.rpc_get_raw_block_internal(hash, order_tag, |stream| async move { Ok(stream) })
self.rpc_get_raw_block_internal(
hash,
priority,
order_tag,
|stream| async move { Ok(stream) },
)
.await
}
@ -249,9 +255,10 @@ impl BlockManager {
pub(crate) async fn rpc_get_raw_block(
&self,
hash: &Hash,
priority: RequestPriority,
order_tag: Option<OrderTag>,
) -> Result<DataBlock, Error> {
self.rpc_get_raw_block_internal(hash, order_tag, |block_stream| async move {
self.rpc_get_raw_block_internal(hash, priority, order_tag, |block_stream| async move {
let (header, stream) = block_stream.into_parts();
read_stream_to_end(stream)
.await
@ -264,6 +271,7 @@ impl BlockManager {
async fn rpc_get_raw_block_internal<F, Fut, T>(
&self,
hash: &Hash,
priority: RequestPriority,
order_tag: Option<OrderTag>,
f: F,
) -> Result<T, Error>
@ -279,7 +287,7 @@ impl BlockManager {
let rpc = self.endpoint.call_streaming(
&node_id,
BlockRpc::GetBlock(*hash, order_tag),
PRIO_NORMAL | PRIO_SECONDARY,
priority,
);
tokio::select! {
res = rpc => {
@ -331,7 +339,9 @@ impl BlockManager {
hash: &Hash,
order_tag: Option<OrderTag>,
) -> Result<ByteStream, Error> {
let block_stream = self.rpc_get_raw_block_streaming(hash, order_tag).await?;
let block_stream = self
.rpc_get_raw_block_streaming(hash, PRIO_NORMAL | PRIO_SECONDARY, order_tag)
.await?;
let (header, stream) = block_stream.into_parts();
match header {
DataBlockHeader::Plain => Ok(stream),

View file

@ -436,7 +436,7 @@ impl BlockResyncManager {
&manager.endpoint,
&need_nodes[..],
put_block_message,
RequestStrategy::with_priority(PRIO_BACKGROUND)
RequestStrategy::with_priority(PRIO_BACKGROUND | PRIO_SECONDARY)
.with_quorum(need_nodes.len()),
)
.await
@ -460,7 +460,9 @@ impl BlockResyncManager {
hash
);
let block_data = manager.rpc_get_raw_block(hash, None).await?;
let block_data = manager
.rpc_get_raw_block(hash, PRIO_BACKGROUND | PRIO_SECONDARY, None)
.await?;
manager.metrics.resync_recv_counter.add(1);

View file

@ -28,12 +28,30 @@ use crate::util::*;
/// The same priority value is given to a request and to its associated response.
pub type RequestPriority = u8;
// Usage of priority levels in Garage:
//
// PRIO_HIGH
// for liveness check events such as pings and important
// reconfiguration events such as layout changes
//
// PRIO_NORMAL
// for standard interactive requests to exchange metadata
//
// PRIO_NORMAL | PRIO_SECONDARY
// for standard interactive requests to exchange block data
//
// PRIO_BACKGROUND
// for background resync requests to exchange metadata
//
// PRIO_BACKGROUND | PRIO_SECONDARY
// for background resync requests to exchange block data
/// Priority class: high
pub const PRIO_HIGH: RequestPriority = 0x20;
/// Priority class: normal
pub const PRIO_NORMAL: RequestPriority = 0x40;
/// Priority class: background
pub const PRIO_BACKGROUND: RequestPriority = 0x80;
/// Priority: primary among given class
pub const PRIO_PRIMARY: RequestPriority = 0x00;
/// Priority: secondary among given class (ex: `PRIO_HIGH | PRIO_SECONDARY`)

View file

@ -109,7 +109,7 @@ impl SendQueuePriority {
let i = order_vec.iter().take_while(|o2| **o2 < order).count();
order_vec.insert(i, order);
}
self.items.push_front(item);
self.items.push_back(item);
}
fn remove(&mut self, id: RequestID) {
if let Some(i) = self.items.iter().position(|x| x.id == id) {
@ -128,6 +128,10 @@ impl SendQueuePriority {
self.items.is_empty()
}
fn poll_next_ready(&mut self, ctx: &mut Context<'_>) -> Poll<(RequestID, DataFrame)> {
// in step 0: poll only streams that have sent 0 bytes; we want to send them with priority,
// as they most likely represent small requests that should be sent first
// in step 1: poll all streams
for step in 0..2 {
for (j, item) in self.items.iter_mut().enumerate() {
if let Some(OrderTag(stream, order)) = item.order_tag {
if order > *self.order.get(&stream).unwrap().front().unwrap() {
@ -135,6 +139,10 @@ impl SendQueuePriority {
}
}
if step == 0 && item.sent > 0 {
continue;
}
let mut item_reader = item.data.read_exact_or_eos(MAX_CHUNK_LENGTH as usize);
if let Poll::Ready(bytes_or_err) = Pin::new(&mut item_reader).poll(ctx) {
let id = item.id;
@ -160,21 +168,18 @@ impl SendQueuePriority {
}
// Remove item from sending queue
self.items.remove(j);
} else {
// Move item later in send queue to implement LAS scheduling
// (LAS = Least Attained Service)
for k in j..self.items.len() - 1 {
if self.items[k].sent >= self.items[k + 1].sent {
self.items.swap(k, k + 1);
} else {
break;
}
}
} else if step == 0 {
// Step 0 means that this stream had not sent any bytes yet.
// Now that it has, and it was not an EOS, we know that it is bigger
// than one chunk, so move it to the end of the queue.
let item = self.items.remove(j).unwrap();
self.items.push_back(item);
}
return Poll::Ready((id, data_frame));
}
}
}
Poll::Pending
}

View file

@ -190,7 +190,7 @@ impl RecvLoop for ServerConn {
let (prio, resp_enc_result) = match ReqEnc::decode(stream).await {
Ok(req_enc) => (req_enc.prio, self2.recv_handler_aux(req_enc).await),
Err(e) => (PRIO_HIGH, Err(e)),
Err(e) => (PRIO_NORMAL, Err(e)),
};
debug!("server: sending response to {}", id);