From e91576677e712c07cf9c47b1a0d2cc4d2d1d37cf Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 16 Feb 2024 10:50:41 +0100 Subject: [PATCH] [reconnect-only-current] filter nodes to reconnect to do not try reconnecting to nodes received from consul/kubernetes discovery if they are not currently in the layout --- src/rpc/system.rs | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index de44e656..14a101ca 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -725,15 +725,18 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = self.ring.borrow().layout.check().is_err(); - let no_peers = self.peering.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.ring.borrow().layout.num_nodes(); - let bad_peers = self + let n_connected = self .peering .get_peer_list() .iter() .filter(|p| p.is_up()) - .count() != expected_n_nodes; + .count(); + + let not_configured = self.ring.borrow().layout.check().is_err(); + let no_peers = n_connected < self.replication_factor; + + let expected_n_nodes = self.ring.borrow().layout.num_nodes(); + let bad_peers = n_connected != expected_n_nodes; if not_configured || no_peers || bad_peers { info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers); @@ -780,6 +783,14 @@ impl System { } } + if !not_configured && !no_peers { + // If the layout is configured, and we already have some connections + // to other nodes in the cluster, we can skip trying to connect to + // nodes that are not in the cluster layout. + let ring = self.ring.borrow(); + ping_list.retain(|(id, _)| ring.layout.node_ids().contains(&(*id).into())); + } + for (node_id, node_addr) in ping_list { let self2 = self.clone(); tokio::spawn(async move {