forked from lx/netapp
Add logic to handle ping timeouts and other failures
This commit is contained in:
parent
fa7cdf3747
commit
22eaa0f404
1 changed files with 32 additions and 2 deletions
|
@ -9,11 +9,13 @@ use async_trait::async_trait;
|
|||
use log::{debug, info, trace, warn};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use tokio::select;
|
||||
use tokio::sync::watch;
|
||||
|
||||
use sodiumoxide::crypto::hash;
|
||||
|
||||
use crate::endpoint::*;
|
||||
use crate::error::*;
|
||||
use crate::netapp::*;
|
||||
use crate::proto::*;
|
||||
use crate::NodeID;
|
||||
|
@ -22,6 +24,8 @@ const CONN_RETRY_INTERVAL: Duration = Duration::from_secs(30);
|
|||
const CONN_MAX_RETRIES: usize = 10;
|
||||
const PING_INTERVAL: Duration = Duration::from_secs(10);
|
||||
const LOOP_DELAY: Duration = Duration::from_secs(1);
|
||||
const PING_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
const FAILED_PING_THRESHOLD: usize = 3;
|
||||
|
||||
// -- Protocol messages --
|
||||
|
||||
|
@ -52,6 +56,7 @@ struct PeerInfoInternal {
|
|||
state: PeerConnState,
|
||||
last_seen: Option<Instant>,
|
||||
ping: VecDeque<Duration>,
|
||||
failed_pings: usize,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
|
@ -178,6 +183,7 @@ impl FullMeshPeeringStrategy {
|
|||
state: PeerConnState::Waiting(0, Instant::now()),
|
||||
last_seen: None,
|
||||
ping: VecDeque::new(),
|
||||
failed_pings: 0,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
@ -191,6 +197,7 @@ impl FullMeshPeeringStrategy {
|
|||
state: PeerConnState::Ourself,
|
||||
last_seen: None,
|
||||
ping: VecDeque::new(),
|
||||
failed_pings: 0,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
@ -347,8 +354,28 @@ impl FullMeshPeeringStrategy {
|
|||
hex::encode(&id[..8]),
|
||||
ping_time
|
||||
);
|
||||
match self.ping_endpoint.call(&id, &ping_msg, PRIO_HIGH).await {
|
||||
Err(e) => warn!("Error pinging {}: {}", hex::encode(&id[..8]), e),
|
||||
let ping_response = select! {
|
||||
r = self.ping_endpoint.call(&id, &ping_msg, PRIO_HIGH) => r,
|
||||
_ = tokio::time::sleep(PING_TIMEOUT) => Err(Error::Message("Ping timeout".into())),
|
||||
};
|
||||
|
||||
match ping_response {
|
||||
Err(e) => {
|
||||
warn!("Error pinging {}: {}", hex::encode(&id[..8]), e);
|
||||
let mut known_hosts = self.known_hosts.write().unwrap();
|
||||
if let Some(host) = known_hosts.list.get_mut(&id) {
|
||||
host.failed_pings += 1;
|
||||
if host.failed_pings > FAILED_PING_THRESHOLD {
|
||||
warn!(
|
||||
"Too many failed pings from {}, closing connection.",
|
||||
hex::encode(&id[..8])
|
||||
);
|
||||
// this will later update info in known_hosts
|
||||
// through the disconnection handler
|
||||
self.netapp.disconnect(&id);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ping_resp) => {
|
||||
let resp_time = Instant::now();
|
||||
debug!(
|
||||
|
@ -359,6 +386,7 @@ impl FullMeshPeeringStrategy {
|
|||
{
|
||||
let mut known_hosts = self.known_hosts.write().unwrap();
|
||||
if let Some(host) = known_hosts.list.get_mut(&id) {
|
||||
host.failed_pings = 0;
|
||||
host.last_seen = Some(resp_time);
|
||||
host.ping.push_back(resp_time - ping_time);
|
||||
while host.ping.len() > 10 {
|
||||
|
@ -455,6 +483,7 @@ impl FullMeshPeeringStrategy {
|
|||
addr,
|
||||
last_seen: None,
|
||||
ping: VecDeque::new(),
|
||||
failed_pings: 0,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
@ -486,6 +515,7 @@ impl FullMeshPeeringStrategy {
|
|||
state,
|
||||
last_seen: None,
|
||||
ping: VecDeque::new(),
|
||||
failed_pings: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue