Add logic to handle ping timeouts and other failures

This commit is contained in:
Alex 2022-03-15 17:01:51 +01:00
parent fa7cdf3747
commit 22eaa0f404
Signed by untrusted user: lx
GPG key ID: 0E496D15096376BE

View file

@@ -9,11 +9,13 @@ use async_trait::async_trait;
use log::{debug, info, trace, warn}; use log::{debug, info, trace, warn};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::select;
use tokio::sync::watch; use tokio::sync::watch;
use sodiumoxide::crypto::hash; use sodiumoxide::crypto::hash;
use crate::endpoint::*; use crate::endpoint::*;
use crate::error::*;
use crate::netapp::*; use crate::netapp::*;
use crate::proto::*; use crate::proto::*;
use crate::NodeID; use crate::NodeID;
@@ -22,6 +24,8 @@ const CONN_RETRY_INTERVAL: Duration = Duration::from_secs(30);
const CONN_MAX_RETRIES: usize = 10; const CONN_MAX_RETRIES: usize = 10;
const PING_INTERVAL: Duration = Duration::from_secs(10); const PING_INTERVAL: Duration = Duration::from_secs(10);
const LOOP_DELAY: Duration = Duration::from_secs(1); const LOOP_DELAY: Duration = Duration::from_secs(1);
const PING_TIMEOUT: Duration = Duration::from_secs(5);
const FAILED_PING_THRESHOLD: usize = 3;
// -- Protocol messages -- // -- Protocol messages --
@@ -52,6 +56,7 @@ struct PeerInfoInternal {
state: PeerConnState, state: PeerConnState,
last_seen: Option<Instant>, last_seen: Option<Instant>,
ping: VecDeque<Duration>, ping: VecDeque<Duration>,
failed_pings: usize,
} }
#[derive(Copy, Clone, Debug)] #[derive(Copy, Clone, Debug)]
@@ -178,6 +183,7 @@ impl FullMeshPeeringStrategy {
state: PeerConnState::Waiting(0, Instant::now()), state: PeerConnState::Waiting(0, Instant::now()),
last_seen: None, last_seen: None,
ping: VecDeque::new(), ping: VecDeque::new(),
failed_pings: 0,
}, },
); );
} }
@@ -191,6 +197,7 @@ impl FullMeshPeeringStrategy {
state: PeerConnState::Ourself, state: PeerConnState::Ourself,
last_seen: None, last_seen: None,
ping: VecDeque::new(), ping: VecDeque::new(),
failed_pings: 0,
}, },
); );
} }
@@ -347,8 +354,28 @@ impl FullMeshPeeringStrategy {
hex::encode(&id[..8]), hex::encode(&id[..8]),
ping_time ping_time
); );
match self.ping_endpoint.call(&id, &ping_msg, PRIO_HIGH).await { let ping_response = select! {
Err(e) => warn!("Error pinging {}: {}", hex::encode(&id[..8]), e), r = self.ping_endpoint.call(&id, &ping_msg, PRIO_HIGH) => r,
_ = tokio::time::sleep(PING_TIMEOUT) => Err(Error::Message("Ping timeout".into())),
};
match ping_response {
Err(e) => {
warn!("Error pinging {}: {}", hex::encode(&id[..8]), e);
let mut known_hosts = self.known_hosts.write().unwrap();
if let Some(host) = known_hosts.list.get_mut(&id) {
host.failed_pings += 1;
if host.failed_pings > FAILED_PING_THRESHOLD {
warn!(
"Too many failed pings from {}, closing connection.",
hex::encode(&id[..8])
);
// this will later update info in known_hosts
// through the disconnection handler
self.netapp.disconnect(&id);
}
}
}
Ok(ping_resp) => { Ok(ping_resp) => {
let resp_time = Instant::now(); let resp_time = Instant::now();
debug!( debug!(
@@ -359,6 +386,7 @@ impl FullMeshPeeringStrategy {
{ {
let mut known_hosts = self.known_hosts.write().unwrap(); let mut known_hosts = self.known_hosts.write().unwrap();
if let Some(host) = known_hosts.list.get_mut(&id) { if let Some(host) = known_hosts.list.get_mut(&id) {
host.failed_pings = 0;
host.last_seen = Some(resp_time); host.last_seen = Some(resp_time);
host.ping.push_back(resp_time - ping_time); host.ping.push_back(resp_time - ping_time);
while host.ping.len() > 10 { while host.ping.len() > 10 {
@@ -455,6 +483,7 @@ impl FullMeshPeeringStrategy {
addr, addr,
last_seen: None, last_seen: None,
ping: VecDeque::new(), ping: VecDeque::new(),
failed_pings: 0,
}, },
); );
} }
@@ -486,6 +515,7 @@ impl FullMeshPeeringStrategy {
state, state,
last_seen: None, last_seen: None,
ping: VecDeque::new(), ping: VecDeque::new(),
failed_pings: 0,
} }
} }
} }