Add logic to handle ping timeouts and other failures
This commit is contained in:
parent
fa7cdf3747
commit
22eaa0f404
1 changed files with 32 additions and 2 deletions
|
@ -9,11 +9,13 @@ use async_trait::async_trait;
|
||||||
use log::{debug, info, trace, warn};
|
use log::{debug, info, trace, warn};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use tokio::select;
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
|
|
||||||
use sodiumoxide::crypto::hash;
|
use sodiumoxide::crypto::hash;
|
||||||
|
|
||||||
use crate::endpoint::*;
|
use crate::endpoint::*;
|
||||||
|
use crate::error::*;
|
||||||
use crate::netapp::*;
|
use crate::netapp::*;
|
||||||
use crate::proto::*;
|
use crate::proto::*;
|
||||||
use crate::NodeID;
|
use crate::NodeID;
|
||||||
|
@ -22,6 +24,8 @@ const CONN_RETRY_INTERVAL: Duration = Duration::from_secs(30);
|
||||||
const CONN_MAX_RETRIES: usize = 10;
|
const CONN_MAX_RETRIES: usize = 10;
|
||||||
const PING_INTERVAL: Duration = Duration::from_secs(10);
|
const PING_INTERVAL: Duration = Duration::from_secs(10);
|
||||||
const LOOP_DELAY: Duration = Duration::from_secs(1);
|
const LOOP_DELAY: Duration = Duration::from_secs(1);
|
||||||
|
const PING_TIMEOUT: Duration = Duration::from_secs(5);
|
||||||
|
const FAILED_PING_THRESHOLD: usize = 3;
|
||||||
|
|
||||||
// -- Protocol messages --
|
// -- Protocol messages --
|
||||||
|
|
||||||
|
@ -52,6 +56,7 @@ struct PeerInfoInternal {
|
||||||
state: PeerConnState,
|
state: PeerConnState,
|
||||||
last_seen: Option<Instant>,
|
last_seen: Option<Instant>,
|
||||||
ping: VecDeque<Duration>,
|
ping: VecDeque<Duration>,
|
||||||
|
failed_pings: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug)]
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
@ -178,6 +183,7 @@ impl FullMeshPeeringStrategy {
|
||||||
state: PeerConnState::Waiting(0, Instant::now()),
|
state: PeerConnState::Waiting(0, Instant::now()),
|
||||||
last_seen: None,
|
last_seen: None,
|
||||||
ping: VecDeque::new(),
|
ping: VecDeque::new(),
|
||||||
|
failed_pings: 0,
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -191,6 +197,7 @@ impl FullMeshPeeringStrategy {
|
||||||
state: PeerConnState::Ourself,
|
state: PeerConnState::Ourself,
|
||||||
last_seen: None,
|
last_seen: None,
|
||||||
ping: VecDeque::new(),
|
ping: VecDeque::new(),
|
||||||
|
failed_pings: 0,
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -347,8 +354,28 @@ impl FullMeshPeeringStrategy {
|
||||||
hex::encode(&id[..8]),
|
hex::encode(&id[..8]),
|
||||||
ping_time
|
ping_time
|
||||||
);
|
);
|
||||||
match self.ping_endpoint.call(&id, &ping_msg, PRIO_HIGH).await {
|
let ping_response = select! {
|
||||||
Err(e) => warn!("Error pinging {}: {}", hex::encode(&id[..8]), e),
|
r = self.ping_endpoint.call(&id, &ping_msg, PRIO_HIGH) => r,
|
||||||
|
_ = tokio::time::sleep(PING_TIMEOUT) => Err(Error::Message("Ping timeout".into())),
|
||||||
|
};
|
||||||
|
|
||||||
|
match ping_response {
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Error pinging {}: {}", hex::encode(&id[..8]), e);
|
||||||
|
let mut known_hosts = self.known_hosts.write().unwrap();
|
||||||
|
if let Some(host) = known_hosts.list.get_mut(&id) {
|
||||||
|
host.failed_pings += 1;
|
||||||
|
if host.failed_pings > FAILED_PING_THRESHOLD {
|
||||||
|
warn!(
|
||||||
|
"Too many failed pings from {}, closing connection.",
|
||||||
|
hex::encode(&id[..8])
|
||||||
|
);
|
||||||
|
// this will later update info in known_hosts
|
||||||
|
// through the disconnection handler
|
||||||
|
self.netapp.disconnect(&id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(ping_resp) => {
|
Ok(ping_resp) => {
|
||||||
let resp_time = Instant::now();
|
let resp_time = Instant::now();
|
||||||
debug!(
|
debug!(
|
||||||
|
@ -359,6 +386,7 @@ impl FullMeshPeeringStrategy {
|
||||||
{
|
{
|
||||||
let mut known_hosts = self.known_hosts.write().unwrap();
|
let mut known_hosts = self.known_hosts.write().unwrap();
|
||||||
if let Some(host) = known_hosts.list.get_mut(&id) {
|
if let Some(host) = known_hosts.list.get_mut(&id) {
|
||||||
|
host.failed_pings = 0;
|
||||||
host.last_seen = Some(resp_time);
|
host.last_seen = Some(resp_time);
|
||||||
host.ping.push_back(resp_time - ping_time);
|
host.ping.push_back(resp_time - ping_time);
|
||||||
while host.ping.len() > 10 {
|
while host.ping.len() > 10 {
|
||||||
|
@ -455,6 +483,7 @@ impl FullMeshPeeringStrategy {
|
||||||
addr,
|
addr,
|
||||||
last_seen: None,
|
last_seen: None,
|
||||||
ping: VecDeque::new(),
|
ping: VecDeque::new(),
|
||||||
|
failed_pings: 0,
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -486,6 +515,7 @@ impl FullMeshPeeringStrategy {
|
||||||
state,
|
state,
|
||||||
last_seen: None,
|
last_seen: None,
|
||||||
ping: VecDeque::new(),
|
ping: VecDeque::new(),
|
||||||
|
failed_pings: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue