Less strict timeouts

2022-09-01 16:30:44 +02:00 · 2022-09-01 16:30:44 +02:00 · df094bd807
commit df094bd807
parent f3bf34b6a1
7 changed files with 16 additions and 10 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2176,7 +2176,7 @@ dependencies = [
 [[package]]
 name = "netapp"
 version = "0.5.0"
-source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#22d96929d5416750e1f5889ee6cc16b382293104"
+source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#f6ad1d0fab340e77fbfcb3488a98c342d334838e"
 dependencies = [
 "arc-swap",
 "async-trait",
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@@ -48,10 +48,14 @@ use crate::repair::*;
 pub const INLINE_THRESHOLD: usize = 3072;

 // Timeout for RPCs that read and write blocks to remote nodes
-const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30);
+const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(60);
 // Timeout for RPCs that ask other nodes whether they need a copy
 // of a given block before we delete it locally
-const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
+// The timeout here is relatively low because we don't want to block
+// the entire resync loop when some nodes are not responding.
+// Nothing will be deleted if the nodes don't answer the queries;
+// we will just retry later.
+const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(15);

 // The delay between the time where a resync operation fails
 // and the time when it is retried, with exponential backoff
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -31,7 +31,7 @@ use garage_util::metrics::RecordDuration;
 use crate::metrics::RpcMetrics;
 use crate::ring::Ring;

-const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);

 // Don't allow more than 100 concurrent outgoing RPCs.
 const MAX_CONCURRENT_REQUESTS: usize = 100;
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@@ -38,7 +38,7 @@ use crate::rpc_helper::*;

 const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
 const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
-const PING_TIMEOUT: Duration = Duration::from_secs(2);
+const SYSTEM_RPC_TIMEOUT: Duration = Duration::from_secs(15);

 /// Version tag used for version check upon Netapp connection
 pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007
@@ -561,7 +561,7 @@ impl System {
 				.broadcast(
 					&self.system_endpoint,
 					SystemRpc::AdvertiseStatus(local_status),
-					RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+					RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
 				)
 				.await;

@@ -685,7 +685,7 @@ impl System {
 				&self.system_endpoint,
 				peer,
 				SystemRpc::PullClusterLayout,
-				RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
 			)
 			.await;
 		if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
--- a/src/table/gc.rs
+++ b/src/table/gc.rs
@@ -25,7 +25,8 @@ use crate::replication::*;
 use crate::schema::*;

 const TABLE_GC_BATCH_SIZE: usize = 1024;
-const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
+// Same timeout as NEED_BLOCK_QUERY_TIMEOUT in block manager
+const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(15);

 // GC delay for table entries: 1 day (24 hours)
 // (the delay before the entry is added in the GC todo list
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@@ -24,7 +24,8 @@ use crate::merkle::*;
 use crate::replication::*;
 use crate::*;

-const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
+// Sync RPCs can carry a lot of data, so use a 1-minute timeout
+const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(60);

 // Do anti-entropy every 10 minutes
 const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
--- a/src/table/table.rs
+++ b/src/table/table.rs
@@ -31,7 +31,7 @@ use crate::schema::*;
 use crate::sync::*;
 use crate::util::*;

-pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
+pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(30);

 pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
 	pub system: Arc<System>,