From df094bd8075332bb765b8b44c9b19cf2485e9ca8 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 1 Sep 2022 16:30:44 +0200 Subject: [PATCH] Less strict timeouts --- Cargo.lock | 2 +- src/block/manager.rs | 8 ++++++-- src/rpc/rpc_helper.rs | 2 +- src/rpc/system.rs | 6 +++--- src/table/gc.rs | 3 ++- src/table/sync.rs | 3 ++- src/table/table.rs | 2 +- 7 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c31d697..632c2131 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2176,7 +2176,7 @@ dependencies = [ [[package]] name = "netapp" version = "0.5.0" -source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#22d96929d5416750e1f5889ee6cc16b382293104" +source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#f6ad1d0fab340e77fbfcb3488a98c342d334838e" dependencies = [ "arc-swap", "async-trait", diff --git a/src/block/manager.rs b/src/block/manager.rs index b9f6fc0f..00438648 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -48,10 +48,14 @@ use crate::repair::*; pub const INLINE_THRESHOLD: usize = 3072; // Timeout for RPCs that read and write blocks to remote nodes -const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30); +const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(60); // Timeout for RPCs that ask other nodes whether they need a copy // of a given block before we delete it locally -const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5); +// The timeout here is relatively low because we don't want to block +// the entire resync loop when some nodes are not responding. +// Nothing will be deleted if the nodes don't answer the queries, +// we will just retry later. +const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(15); // The delay between the time where a resync operation fails // and the time when it is retried, with exponential backoff diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 6c79c502..e9575261 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -31,7 +31,7 @@ use garage_util::metrics::RecordDuration; use crate::metrics::RpcMetrics; use crate::ring::Ring; -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); // Don't allow more than 100 concurrent outgoing RPCs. const MAX_CONCURRENT_REQUESTS: usize = 100; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 5858660e..d7ef2140 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -38,7 +38,7 @@ use crate::rpc_helper::*; const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60); const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); -const PING_TIMEOUT: Duration = Duration::from_secs(2); +const SYSTEM_RPC_TIMEOUT: Duration = Duration::from_secs(15); /// Version tag used for version check upon Netapp connection pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007 @@ -561,7 +561,7 @@ impl System { .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT), ) .await; @@ -685,7 +685,7 @@ impl System { &self.system_endpoint, peer, SystemRpc::PullClusterLayout, - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT), ) .await; if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { diff --git a/src/table/gc.rs b/src/table/gc.rs index 12218d97..6cae9701 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -25,7 +25,8 @@ use crate::replication::*; use crate::schema::*; const TABLE_GC_BATCH_SIZE: usize = 1024; -const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30); +// Same timeout as NEED_BLOCK_QUERY_TIMEOUT in block manager +const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(15); // GC delay for table entries: 1 day (24 hours) // (the delay before the entry is added in the GC todo list diff --git a/src/table/sync.rs b/src/table/sync.rs index b3756a5e..62b88a58 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -24,7 +24,8 @@ use crate::merkle::*; use crate::replication::*; use crate::*; -const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30); +// Sync RPC can contain a lot of data, so have a 1min timeout +const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(60); // Do anti-entropy every 10 minutes const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60); diff --git a/src/table/table.rs b/src/table/table.rs index 3c211728..51f3837f 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -31,7 +31,7 @@ use crate::schema::*; use crate::sync::*; use crate::util::*; -pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10); +pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(30); pub struct Table { pub system: Arc,