2020-04-10 20:01:48 +00:00
|
|
|
use std::collections::HashMap;
|
2021-02-21 14:24:30 +00:00
|
|
|
use std::fmt::Write as FmtWrite;
|
2021-02-23 17:46:25 +00:00
|
|
|
use std::io::{Read, Write};
|
2020-04-10 20:01:48 +00:00
|
|
|
use std::net::{IpAddr, SocketAddr};
|
2020-04-07 14:26:22 +00:00
|
|
|
use std::path::PathBuf;
|
2020-04-23 16:05:43 +00:00
|
|
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
2020-04-10 20:01:48 +00:00
|
|
|
use std::sync::Arc;
|
2020-04-06 17:55:39 +00:00
|
|
|
use std::time::Duration;
|
|
|
|
|
2020-04-06 19:02:15 +00:00
|
|
|
use futures::future::join_all;
|
2020-04-11 16:51:11 +00:00
|
|
|
use futures::select;
|
|
|
|
use futures_util::future::*;
|
2020-04-18 17:21:34 +00:00
|
|
|
use serde::{Deserialize, Serialize};
|
2020-04-11 16:51:11 +00:00
|
|
|
use tokio::sync::watch;
|
2020-04-11 21:53:32 +00:00
|
|
|
use tokio::sync::Mutex;
|
2020-04-06 17:55:39 +00:00
|
|
|
|
2020-04-24 10:10:01 +00:00
|
|
|
use garage_util::background::BackgroundRunner;
|
|
|
|
use garage_util::data::*;
|
|
|
|
use garage_util::error::Error;
|
2021-04-05 17:55:53 +00:00
|
|
|
use garage_util::persister::Persister;
|
2021-03-15 15:21:41 +00:00
|
|
|
use garage_util::time::*;
|
2020-04-23 17:05:46 +00:00
|
|
|
|
2020-06-30 16:33:14 +00:00
|
|
|
use crate::consul::get_consul_nodes;
|
2021-02-23 17:46:25 +00:00
|
|
|
use crate::ring::*;
|
2020-04-24 10:10:01 +00:00
|
|
|
use crate::rpc_client::*;
|
|
|
|
use crate::rpc_server::*;
|
2020-04-06 17:55:39 +00:00
|
|
|
|
|
|
|
/// Pause between two rounds of the ping loop, which pings every known peer.
const PING_INTERVAL: Duration = Duration::from_secs(10);
|
2021-04-05 17:55:53 +00:00
|
|
|
/// Pause between two iterations of the discovery loop and of the Consul loop.
const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
|
2020-04-06 17:55:39 +00:00
|
|
|
/// Timeout applied to the membership RPCs issued from this file (pings,
/// status/config pulls and advertisement broadcasts).
const PING_TIMEOUT: Duration = Duration::from_secs(2);
|
2020-04-23 16:05:43 +00:00
|
|
|
/// A node is reported as up only while its failure counter stays strictly
/// below this value (see `StatusEntry::is_up`).
const MAX_FAILURES_BEFORE_CONSIDERED_DOWN: usize = 5;
|
2020-04-06 17:55:39 +00:00
|
|
|
|
2020-04-19 15:15:48 +00:00
|
|
|
/// RPC path under which the membership handler is registered and on which the
/// membership RPC client sends its messages.
pub const MEMBERSHIP_RPC_PATH: &str = "_membership";
|
|
|
|
|
2020-04-18 17:21:34 +00:00
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
|
|
pub enum Message {
|
|
|
|
Ok,
|
|
|
|
Ping(PingMessage),
|
|
|
|
PullStatus,
|
|
|
|
PullConfig,
|
|
|
|
AdvertiseNodesUp(Vec<AdvertisedNode>),
|
|
|
|
AdvertiseConfig(NetworkConfig),
|
|
|
|
}
|
|
|
|
|
|
|
|
impl RpcMessage for Message {}
|
|
|
|
|
2020-04-18 17:30:05 +00:00
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
|
|
pub struct PingMessage {
|
2020-12-12 16:58:19 +00:00
|
|
|
id: UUID,
|
|
|
|
rpc_port: u16,
|
2020-04-18 17:30:05 +00:00
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
status_hash: Hash,
|
|
|
|
config_version: u64,
|
2020-04-19 17:08:48 +00:00
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
state_info: StateInfo,
|
2020-04-18 17:30:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
|
|
|
pub struct AdvertisedNode {
|
|
|
|
pub id: UUID,
|
|
|
|
pub addr: SocketAddr,
|
2020-04-23 16:05:43 +00:00
|
|
|
|
|
|
|
pub is_up: bool,
|
|
|
|
pub last_seen: u64,
|
|
|
|
|
2020-04-19 17:08:48 +00:00
|
|
|
pub state_info: StateInfo,
|
2020-04-18 17:30:05 +00:00
|
|
|
}
|
|
|
|
|
2020-04-06 17:55:39 +00:00
|
|
|
pub struct System {
|
|
|
|
pub id: UUID,
|
|
|
|
|
2021-04-05 17:55:53 +00:00
|
|
|
persist_config: Persister<NetworkConfig>,
|
|
|
|
persist_status: Persister<Vec<AdvertisedNode>>,
|
2020-12-12 16:58:19 +00:00
|
|
|
rpc_local_port: u16,
|
|
|
|
|
|
|
|
state_info: StateInfo,
|
2020-04-19 17:08:48 +00:00
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
rpc_http_client: Arc<RpcHttpClient>,
|
2020-04-18 17:21:34 +00:00
|
|
|
rpc_client: Arc<RpcClient<Message>>,
|
2020-04-06 17:55:39 +00:00
|
|
|
|
2021-02-21 12:11:10 +00:00
|
|
|
pub(crate) status: watch::Receiver<Arc<Status>>,
|
2020-04-11 21:53:32 +00:00
|
|
|
pub ring: watch::Receiver<Arc<Ring>>,
|
|
|
|
|
2021-04-05 17:55:53 +00:00
|
|
|
update_lock: Mutex<Updaters>,
|
2020-04-11 16:51:11 +00:00
|
|
|
|
|
|
|
pub background: Arc<BackgroundRunner>,
|
2020-04-06 17:55:39 +00:00
|
|
|
}
|
|
|
|
|
2021-04-05 17:55:53 +00:00
|
|
|
struct Updaters {
|
|
|
|
update_status: watch::Sender<Arc<Status>>,
|
|
|
|
update_ring: watch::Sender<Arc<Ring>>,
|
|
|
|
}
|
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
#[derive(Debug, Clone)]
|
|
|
|
pub struct Status {
|
2020-04-23 16:05:43 +00:00
|
|
|
pub nodes: HashMap<UUID, Arc<StatusEntry>>,
|
2020-04-11 21:53:32 +00:00
|
|
|
pub hash: Hash,
|
2020-04-07 15:00:48 +00:00
|
|
|
}
|
|
|
|
|
2020-04-23 16:05:43 +00:00
|
|
|
#[derive(Debug)]
|
2020-04-19 17:08:48 +00:00
|
|
|
pub struct StatusEntry {
|
2020-04-07 15:00:48 +00:00
|
|
|
pub addr: SocketAddr,
|
2020-04-23 16:05:43 +00:00
|
|
|
pub last_seen: u64,
|
|
|
|
pub num_failures: AtomicUsize,
|
2020-04-19 17:08:48 +00:00
|
|
|
pub state_info: StateInfo,
|
|
|
|
}
|
|
|
|
|
2020-04-23 16:05:43 +00:00
|
|
|
impl StatusEntry {
|
|
|
|
pub fn is_up(&self) -> bool {
|
|
|
|
self.num_failures.load(Ordering::SeqCst) < MAX_FAILURES_BEFORE_CONSIDERED_DOWN
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-19 17:08:48 +00:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
pub struct StateInfo {
|
|
|
|
pub hostname: String,
|
2020-04-07 15:00:48 +00:00
|
|
|
}
|
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
impl Status {
|
2020-04-06 20:27:51 +00:00
|
|
|
fn handle_ping(&mut self, ip: IpAddr, info: &PingMessage) -> bool {
|
2020-04-06 20:54:03 +00:00
|
|
|
let addr = SocketAddr::new(ip, info.rpc_port);
|
2020-04-11 21:53:32 +00:00
|
|
|
let old_status = self.nodes.insert(
|
2020-04-21 17:08:42 +00:00
|
|
|
info.id,
|
2020-04-23 16:05:43 +00:00
|
|
|
Arc::new(StatusEntry {
|
2020-04-21 17:08:42 +00:00
|
|
|
addr,
|
2020-04-23 16:05:43 +00:00
|
|
|
last_seen: now_msec(),
|
|
|
|
num_failures: AtomicUsize::from(0),
|
2020-04-19 17:08:48 +00:00
|
|
|
state_info: info.state_info.clone(),
|
2020-04-23 16:05:43 +00:00
|
|
|
}),
|
2020-04-10 20:01:48 +00:00
|
|
|
);
|
2020-04-06 20:54:03 +00:00
|
|
|
match old_status {
|
|
|
|
None => {
|
2020-04-21 12:54:55 +00:00
|
|
|
info!("Newly pingable node: {}", hex::encode(&info.id));
|
2020-04-06 20:54:03 +00:00
|
|
|
true
|
|
|
|
}
|
|
|
|
Some(x) => x.addr != addr,
|
|
|
|
}
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
fn recalculate_hash(&mut self) {
|
|
|
|
let mut nodes = self.nodes.iter().collect::<Vec<_>>();
|
2020-04-07 16:10:20 +00:00
|
|
|
nodes.sort_unstable_by_key(|(id, _status)| *id);
|
2020-04-06 20:27:51 +00:00
|
|
|
|
2021-02-21 14:24:30 +00:00
|
|
|
let mut nodes_txt = String::new();
|
2020-04-21 12:54:55 +00:00
|
|
|
debug!("Current set of pingable nodes: --");
|
2020-04-06 20:27:51 +00:00
|
|
|
for (id, status) in nodes {
|
2020-04-21 12:54:55 +00:00
|
|
|
debug!("{} {}", hex::encode(&id), status.addr);
|
2021-02-21 14:24:30 +00:00
|
|
|
writeln!(&mut nodes_txt, "{} {}", hex::encode(&id), status.addr).unwrap();
|
2020-04-06 19:02:15 +00:00
|
|
|
}
|
2020-04-21 12:54:55 +00:00
|
|
|
debug!("END --");
|
2021-02-23 17:14:37 +00:00
|
|
|
self.hash = blake2sum(nodes_txt.as_bytes());
|
2020-04-06 19:02:15 +00:00
|
|
|
}
|
2021-04-05 17:55:53 +00:00
|
|
|
|
|
|
|
fn to_serializable_membership(&self, system: &System) -> Vec<AdvertisedNode> {
|
|
|
|
let mut mem = vec![];
|
|
|
|
for (node, status) in self.nodes.iter() {
|
|
|
|
let state_info = if *node == system.id {
|
|
|
|
system.state_info.clone()
|
|
|
|
} else {
|
|
|
|
status.state_info.clone()
|
|
|
|
};
|
|
|
|
mem.push(AdvertisedNode {
|
|
|
|
id: *node,
|
|
|
|
addr: status.addr,
|
|
|
|
is_up: status.is_up(),
|
|
|
|
last_seen: status.last_seen,
|
|
|
|
state_info,
|
|
|
|
});
|
|
|
|
}
|
|
|
|
mem
|
|
|
|
}
|
2020-04-11 21:53:32 +00:00
|
|
|
}
|
2020-04-06 19:02:15 +00:00
|
|
|
|
2020-04-23 17:05:46 +00:00
|
|
|
fn gen_node_id(metadata_dir: &PathBuf) -> Result<UUID, Error> {
|
|
|
|
let mut id_file = metadata_dir.clone();
|
|
|
|
id_file.push("node_id");
|
|
|
|
if id_file.as_path().exists() {
|
|
|
|
let mut f = std::fs::File::open(id_file.as_path())?;
|
|
|
|
let mut d = vec![];
|
|
|
|
f.read_to_end(&mut d)?;
|
|
|
|
if d.len() != 32 {
|
|
|
|
return Err(Error::Message(format!("Corrupt node_id file")));
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut id = [0u8; 32];
|
|
|
|
id.copy_from_slice(&d[..]);
|
|
|
|
Ok(id.into())
|
|
|
|
} else {
|
|
|
|
let id = gen_uuid();
|
|
|
|
|
|
|
|
let mut f = std::fs::File::create(id_file.as_path())?;
|
|
|
|
f.write_all(id.as_slice())?;
|
|
|
|
Ok(id)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-06 17:55:39 +00:00
|
|
|
impl System {
|
2020-04-18 17:21:34 +00:00
|
|
|
pub fn new(
|
2020-12-12 16:58:19 +00:00
|
|
|
metadata_dir: PathBuf,
|
2020-04-23 17:05:46 +00:00
|
|
|
rpc_http_client: Arc<RpcHttpClient>,
|
2020-04-18 17:21:34 +00:00
|
|
|
background: Arc<BackgroundRunner>,
|
|
|
|
rpc_server: &mut RpcServer,
|
|
|
|
) -> Arc<Self> {
|
2020-12-12 16:58:19 +00:00
|
|
|
let id = gen_node_id(&metadata_dir).expect("Unable to read or generate node ID");
|
2020-04-23 17:05:46 +00:00
|
|
|
info!("Node ID: {}", hex::encode(&id));
|
|
|
|
|
2021-04-05 17:55:53 +00:00
|
|
|
let persist_config = Persister::new(&metadata_dir, "network_config");
|
|
|
|
let persist_status = Persister::new(&metadata_dir, "peer_info");
|
|
|
|
|
|
|
|
let net_config = match persist_config.load() {
|
2020-04-10 20:01:48 +00:00
|
|
|
Ok(x) => x,
|
|
|
|
Err(e) => {
|
2020-04-21 12:54:55 +00:00
|
|
|
info!(
|
2020-04-10 20:01:48 +00:00
|
|
|
"No valid previous network configuration stored ({}), starting fresh.",
|
|
|
|
e
|
|
|
|
);
|
2021-02-21 12:11:10 +00:00
|
|
|
NetworkConfig::new()
|
2020-04-10 20:01:48 +00:00
|
|
|
}
|
|
|
|
};
|
2021-04-05 17:55:53 +00:00
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
let mut status = Status {
|
|
|
|
nodes: HashMap::new(),
|
|
|
|
hash: Hash::default(),
|
|
|
|
};
|
|
|
|
status.recalculate_hash();
|
|
|
|
let (update_status, status) = watch::channel(Arc::new(status));
|
|
|
|
|
2020-04-19 17:08:48 +00:00
|
|
|
let state_info = StateInfo {
|
|
|
|
hostname: gethostname::gethostname()
|
|
|
|
.into_string()
|
|
|
|
.unwrap_or("<invalid utf-8>".to_string()),
|
|
|
|
};
|
|
|
|
|
2021-03-05 15:22:29 +00:00
|
|
|
let ring = Ring::new(net_config);
|
2020-04-11 21:53:32 +00:00
|
|
|
let (update_ring, ring) = watch::channel(Arc::new(ring));
|
|
|
|
|
2020-04-19 15:15:48 +00:00
|
|
|
let rpc_path = MEMBERSHIP_RPC_PATH.to_string();
|
2020-04-18 17:21:34 +00:00
|
|
|
let rpc_client = RpcClient::new(
|
2020-04-19 15:15:48 +00:00
|
|
|
RpcAddrClient::<Message>::new(rpc_http_client.clone(), rpc_path.clone()),
|
2020-04-18 17:21:34 +00:00
|
|
|
background.clone(),
|
|
|
|
status.clone(),
|
|
|
|
);
|
2020-04-12 13:51:19 +00:00
|
|
|
|
2020-04-18 17:21:34 +00:00
|
|
|
let sys = Arc::new(System {
|
2020-04-06 21:10:28 +00:00
|
|
|
id,
|
2021-04-05 17:55:53 +00:00
|
|
|
persist_config,
|
|
|
|
persist_status,
|
2020-04-23 17:05:46 +00:00
|
|
|
rpc_local_port: rpc_server.bind_addr.port(),
|
2020-04-19 17:08:48 +00:00
|
|
|
state_info,
|
2020-04-18 17:21:34 +00:00
|
|
|
rpc_http_client,
|
2020-04-12 13:51:19 +00:00
|
|
|
rpc_client,
|
2020-04-11 21:53:32 +00:00
|
|
|
status,
|
|
|
|
ring,
|
2021-04-05 17:55:53 +00:00
|
|
|
update_lock: Mutex::new(Updaters {
|
|
|
|
update_status,
|
|
|
|
update_ring,
|
|
|
|
}),
|
2020-04-11 16:51:11 +00:00
|
|
|
background,
|
2020-04-18 17:21:34 +00:00
|
|
|
});
|
2020-04-19 15:15:48 +00:00
|
|
|
sys.clone().register_handler(rpc_server, rpc_path);
|
2020-04-18 17:21:34 +00:00
|
|
|
sys
|
|
|
|
}
|
|
|
|
|
|
|
|
fn register_handler(self: Arc<Self>, rpc_server: &mut RpcServer, path: String) {
|
|
|
|
rpc_server.add_handler::<Message, _, _>(path, move |msg, addr| {
|
|
|
|
let self2 = self.clone();
|
|
|
|
async move {
|
|
|
|
match msg {
|
|
|
|
Message::Ping(ping) => self2.handle_ping(&addr, &ping).await,
|
|
|
|
|
|
|
|
Message::PullStatus => self2.handle_pull_status(),
|
|
|
|
Message::PullConfig => self2.handle_pull_config(),
|
|
|
|
Message::AdvertiseNodesUp(adv) => self2.handle_advertise_nodes_up(&adv).await,
|
|
|
|
Message::AdvertiseConfig(adv) => self2.handle_advertise_config(&adv).await,
|
|
|
|
|
2020-11-08 14:04:30 +00:00
|
|
|
_ => Err(Error::BadRPC(format!("Unexpected RPC message"))),
|
2020-04-18 17:21:34 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn rpc_client<M: RpcMessage + 'static>(self: &Arc<Self>, path: &str) -> Arc<RpcClient<M>> {
|
|
|
|
RpcClient::new(
|
|
|
|
RpcAddrClient::new(self.rpc_http_client.clone(), path.to_string()),
|
|
|
|
self.background.clone(),
|
|
|
|
self.status.clone(),
|
|
|
|
)
|
2020-04-06 17:55:39 +00:00
|
|
|
}
|
|
|
|
|
2020-04-11 16:51:11 +00:00
|
|
|
/// Writes the configuration of the current ring to persistent storage.
///
/// # Panics
///
/// Panics if the configuration cannot be saved; the `Ok(())` return value is
/// therefore the only one this function produces.
async fn save_network_config(self: Arc<Self>) -> Result<(), Error> {
    let current_ring = self.ring.borrow().clone();
    let outcome = self.persist_config.save_async(&current_ring.config).await;
    outcome.expect("Cannot save current cluster configuration");
    Ok(())
}
|
2020-04-07 14:26:22 +00:00
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
fn make_ping(&self) -> Message {
|
2020-04-11 21:53:32 +00:00
|
|
|
let status = self.status.borrow().clone();
|
|
|
|
let ring = self.ring.borrow().clone();
|
2020-04-10 20:01:48 +00:00
|
|
|
Message::Ping(PingMessage {
|
2020-04-21 17:08:42 +00:00
|
|
|
id: self.id,
|
2020-04-23 17:05:46 +00:00
|
|
|
rpc_port: self.rpc_local_port,
|
2020-04-21 17:08:42 +00:00
|
|
|
status_hash: status.hash,
|
2020-04-11 21:53:32 +00:00
|
|
|
config_version: ring.config.version,
|
2020-04-19 17:08:48 +00:00
|
|
|
state_info: self.state_info.clone(),
|
2020-04-06 19:02:15 +00:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
/// Sends `msg` to every known node except the local one, with the given
/// per-call `timeout`. The responses are discarded.
async fn broadcast(self: Arc<Self>, msg: Message, timeout: Duration) {
    let snapshot = self.status.borrow().clone();
    let recipients: Vec<_> = snapshot
        .nodes
        .keys()
        .filter(|node_id| **node_id != self.id)
        .cloned()
        .collect();

    self.rpc_client.call_many(&recipients[..], msg, timeout).await;
}
|
|
|
|
|
2020-06-30 16:33:14 +00:00
|
|
|
pub async fn bootstrap(
|
|
|
|
self: Arc<Self>,
|
2021-04-05 17:55:53 +00:00
|
|
|
peers: Vec<SocketAddr>,
|
2020-06-30 16:33:14 +00:00
|
|
|
consul_host: Option<String>,
|
|
|
|
consul_service_name: Option<String>,
|
|
|
|
) {
|
2021-04-05 17:55:53 +00:00
|
|
|
let self2 = self.clone();
|
|
|
|
self.background
|
|
|
|
.spawn_worker(format!("discovery loop"), |stop_signal| {
|
|
|
|
self2.discovery_loop(peers, stop_signal)
|
|
|
|
});
|
2020-04-06 22:00:43 +00:00
|
|
|
|
2020-06-30 16:33:14 +00:00
|
|
|
let self2 = self.clone();
|
2021-03-15 22:14:12 +00:00
|
|
|
self.background
|
|
|
|
.spawn_worker(format!("ping loop"), |stop_signal| {
|
|
|
|
self2.ping_loop(stop_signal)
|
|
|
|
});
|
2020-06-30 16:33:14 +00:00
|
|
|
|
|
|
|
if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) {
|
|
|
|
let self2 = self.clone();
|
2021-03-15 22:14:12 +00:00
|
|
|
self.background
|
2020-06-30 16:33:14 +00:00
|
|
|
.spawn_worker(format!("Consul loop"), |stop_signal| {
|
2021-03-15 22:14:12 +00:00
|
|
|
self2.consul_loop(stop_signal, consul_host, consul_service_name)
|
2021-03-11 12:47:21 +00:00
|
|
|
});
|
2020-06-30 16:33:14 +00:00
|
|
|
}
|
2020-04-06 22:00:43 +00:00
|
|
|
}
|
|
|
|
|
2020-04-16 12:50:49 +00:00
|
|
|
async fn ping_nodes(self: Arc<Self>, peers: Vec<(SocketAddr, Option<UUID>)>) {
|
2020-04-11 21:53:32 +00:00
|
|
|
let ping_msg = self.make_ping();
|
2020-04-10 20:01:48 +00:00
|
|
|
let ping_resps = join_all(peers.iter().map(|(addr, id_option)| {
|
|
|
|
let sys = self.clone();
|
|
|
|
let ping_msg_ref = &ping_msg;
|
|
|
|
async move {
|
|
|
|
(
|
|
|
|
id_option,
|
2020-04-21 17:08:42 +00:00
|
|
|
addr,
|
2020-04-18 17:21:34 +00:00
|
|
|
sys.rpc_client
|
|
|
|
.by_addr()
|
|
|
|
.call(&addr, ping_msg_ref, PING_TIMEOUT)
|
|
|
|
.await,
|
2020-04-10 20:01:48 +00:00
|
|
|
)
|
|
|
|
}
|
|
|
|
}))
|
|
|
|
.await;
|
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
let update_locked = self.update_lock.lock().await;
|
|
|
|
let mut status: Status = self.status.borrow().as_ref().clone();
|
|
|
|
let ring = self.ring.borrow().clone();
|
2020-04-06 22:00:43 +00:00
|
|
|
|
|
|
|
let mut has_changes = false;
|
|
|
|
let mut to_advertise = vec![];
|
|
|
|
|
|
|
|
for (id_option, addr, ping_resp) in ping_resps {
|
2020-04-23 16:05:43 +00:00
|
|
|
if let Ok(Ok(Message::Ping(info))) = ping_resp {
|
2020-04-11 21:53:32 +00:00
|
|
|
let is_new = status.handle_ping(addr.ip(), &info);
|
2020-04-06 22:00:43 +00:00
|
|
|
if is_new {
|
|
|
|
has_changes = true;
|
2020-04-10 20:01:48 +00:00
|
|
|
to_advertise.push(AdvertisedNode {
|
2020-04-21 17:08:42 +00:00
|
|
|
id: info.id,
|
|
|
|
addr: *addr,
|
2020-04-23 16:05:43 +00:00
|
|
|
is_up: true,
|
|
|
|
last_seen: now_msec(),
|
2020-04-19 17:08:48 +00:00
|
|
|
state_info: info.state_info.clone(),
|
2020-04-06 22:00:43 +00:00
|
|
|
});
|
|
|
|
}
|
2020-04-11 21:53:32 +00:00
|
|
|
if is_new || status.hash != info.status_hash {
|
2020-04-11 16:51:11 +00:00
|
|
|
self.background
|
2020-04-21 17:08:42 +00:00
|
|
|
.spawn_cancellable(self.clone().pull_status(info.id).map(Ok));
|
2020-04-06 22:00:43 +00:00
|
|
|
}
|
2020-04-11 21:53:32 +00:00
|
|
|
if is_new || ring.config.version < info.config_version {
|
2020-04-11 16:51:11 +00:00
|
|
|
self.background
|
2020-04-21 17:08:42 +00:00
|
|
|
.spawn_cancellable(self.clone().pull_config(info.id).map(Ok));
|
2020-04-06 22:00:43 +00:00
|
|
|
}
|
|
|
|
} else if let Some(id) = id_option {
|
2020-04-23 16:05:43 +00:00
|
|
|
if let Some(st) = status.nodes.get_mut(id) {
|
|
|
|
st.num_failures.fetch_add(1, Ordering::SeqCst);
|
|
|
|
if !st.is_up() {
|
|
|
|
warn!("Node {:?} seems to be down.", id);
|
|
|
|
if !ring.config.members.contains_key(id) {
|
|
|
|
info!("Removing node {:?} from status (not in config and not responding to pings anymore)", id);
|
|
|
|
drop(st);
|
|
|
|
status.nodes.remove(&id);
|
|
|
|
has_changes = true;
|
|
|
|
}
|
2020-04-06 22:00:43 +00:00
|
|
|
}
|
|
|
|
}
|
2020-04-06 19:02:15 +00:00
|
|
|
}
|
|
|
|
}
|
2020-04-06 22:00:43 +00:00
|
|
|
if has_changes {
|
2020-04-11 21:53:32 +00:00
|
|
|
status.recalculate_hash();
|
2020-04-06 22:00:43 +00:00
|
|
|
}
|
2021-04-05 18:26:01 +00:00
|
|
|
self.update_status(&update_locked, status).await;
|
2020-04-11 21:53:32 +00:00
|
|
|
drop(update_locked);
|
2020-04-06 17:55:39 +00:00
|
|
|
|
2020-04-06 22:00:43 +00:00
|
|
|
if to_advertise.len() > 0 {
|
2020-04-10 20:01:48 +00:00
|
|
|
self.broadcast(Message::AdvertiseNodesUp(to_advertise), PING_TIMEOUT)
|
|
|
|
.await;
|
2020-04-06 22:00:43 +00:00
|
|
|
}
|
2020-04-06 19:02:15 +00:00
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
async fn handle_ping(
|
2020-04-10 20:01:48 +00:00
|
|
|
self: Arc<Self>,
|
|
|
|
from: &SocketAddr,
|
|
|
|
ping: &PingMessage,
|
|
|
|
) -> Result<Message, Error> {
|
2020-04-11 21:53:32 +00:00
|
|
|
let update_locked = self.update_lock.lock().await;
|
|
|
|
let mut status: Status = self.status.borrow().as_ref().clone();
|
|
|
|
|
|
|
|
let is_new = status.handle_ping(from.ip(), ping);
|
2020-04-06 20:54:03 +00:00
|
|
|
if is_new {
|
2020-04-11 21:53:32 +00:00
|
|
|
status.recalculate_hash();
|
2020-04-06 20:54:03 +00:00
|
|
|
}
|
2020-04-21 17:08:42 +00:00
|
|
|
let status_hash = status.hash;
|
2020-04-11 21:53:32 +00:00
|
|
|
let config_version = self.ring.borrow().config.version;
|
|
|
|
|
2021-04-05 18:26:01 +00:00
|
|
|
self.update_status(&update_locked, status).await;
|
2020-04-11 21:53:32 +00:00
|
|
|
drop(update_locked);
|
2020-04-06 19:02:15 +00:00
|
|
|
|
2020-04-06 20:27:51 +00:00
|
|
|
if is_new || status_hash != ping.status_hash {
|
2020-04-11 21:53:32 +00:00
|
|
|
self.background
|
2020-04-21 17:08:42 +00:00
|
|
|
.spawn_cancellable(self.clone().pull_status(ping.id).map(Ok));
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
if is_new || config_version < ping.config_version {
|
2020-04-11 21:53:32 +00:00
|
|
|
self.background
|
2020-04-21 17:08:42 +00:00
|
|
|
.spawn_cancellable(self.clone().pull_config(ping.id).map(Ok));
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
Ok(self.make_ping())
|
2020-04-06 19:02:15 +00:00
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
fn handle_pull_status(&self) -> Result<Message, Error> {
|
2021-04-05 17:55:53 +00:00
|
|
|
Ok(Message::AdvertiseNodesUp(
|
|
|
|
self.status.borrow().to_serializable_membership(self),
|
|
|
|
))
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
fn handle_pull_config(&self) -> Result<Message, Error> {
|
2020-04-11 21:53:32 +00:00
|
|
|
let ring = self.ring.borrow().clone();
|
|
|
|
Ok(Message::AdvertiseConfig(ring.config.clone()))
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
async fn handle_advertise_nodes_up(
|
2020-04-10 20:01:48 +00:00
|
|
|
self: Arc<Self>,
|
|
|
|
adv: &[AdvertisedNode],
|
|
|
|
) -> Result<Message, Error> {
|
2020-04-06 22:00:43 +00:00
|
|
|
let mut to_ping = vec![];
|
2020-04-06 20:27:51 +00:00
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
let update_lock = self.update_lock.lock().await;
|
|
|
|
let mut status: Status = self.status.borrow().as_ref().clone();
|
2020-04-06 22:00:43 +00:00
|
|
|
let mut has_changed = false;
|
|
|
|
|
2020-04-06 20:27:51 +00:00
|
|
|
for node in adv.iter() {
|
2020-04-06 22:00:43 +00:00
|
|
|
if node.id == self.id {
|
|
|
|
// learn our own ip address
|
2020-04-23 17:05:46 +00:00
|
|
|
let self_addr = SocketAddr::new(node.addr.ip(), self.rpc_local_port);
|
2020-04-11 21:53:32 +00:00
|
|
|
let old_self = status.nodes.insert(
|
2020-04-21 17:08:42 +00:00
|
|
|
node.id,
|
2020-04-23 16:05:43 +00:00
|
|
|
Arc::new(StatusEntry {
|
2020-04-06 22:00:43 +00:00
|
|
|
addr: self_addr,
|
2020-04-23 16:05:43 +00:00
|
|
|
last_seen: now_msec(),
|
|
|
|
num_failures: AtomicUsize::from(0),
|
2020-04-19 17:08:48 +00:00
|
|
|
state_info: self.state_info.clone(),
|
2020-04-23 16:05:43 +00:00
|
|
|
}),
|
2020-04-10 20:01:48 +00:00
|
|
|
);
|
2020-04-06 22:00:43 +00:00
|
|
|
has_changed = match old_self {
|
|
|
|
None => true,
|
|
|
|
Some(x) => x.addr != self_addr,
|
|
|
|
};
|
2020-04-23 16:05:43 +00:00
|
|
|
} else {
|
|
|
|
let ping_them = match status.nodes.get(&node.id) {
|
|
|
|
// Case 1: new node
|
|
|
|
None => true,
|
|
|
|
// Case 2: the node might have changed address
|
|
|
|
Some(our_node) => node.is_up && !our_node.is_up() && our_node.addr != node.addr,
|
|
|
|
};
|
|
|
|
if ping_them {
|
|
|
|
to_ping.push((node.addr, Some(node.id)));
|
|
|
|
}
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
}
|
2020-04-06 22:00:43 +00:00
|
|
|
if has_changed {
|
2020-04-11 21:53:32 +00:00
|
|
|
status.recalculate_hash();
|
2020-04-06 22:00:43 +00:00
|
|
|
}
|
2021-04-05 18:26:01 +00:00
|
|
|
self.update_status(&update_lock, status).await;
|
2020-04-11 21:53:32 +00:00
|
|
|
drop(update_lock);
|
2020-04-06 22:00:43 +00:00
|
|
|
|
|
|
|
if to_ping.len() > 0 {
|
2020-04-11 21:53:32 +00:00
|
|
|
self.background
|
|
|
|
.spawn_cancellable(self.clone().ping_nodes(to_ping).map(Ok));
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Ok(Message::Ok)
|
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
async fn handle_advertise_config(
|
2020-04-10 20:01:48 +00:00
|
|
|
self: Arc<Self>,
|
|
|
|
adv: &NetworkConfig,
|
|
|
|
) -> Result<Message, Error> {
|
2020-04-11 21:53:32 +00:00
|
|
|
let update_lock = self.update_lock.lock().await;
|
2021-02-21 12:11:10 +00:00
|
|
|
let ring: Arc<Ring> = self.ring.borrow().clone();
|
2020-04-11 21:53:32 +00:00
|
|
|
|
|
|
|
if adv.version > ring.config.version {
|
2021-03-05 15:22:29 +00:00
|
|
|
let ring = Ring::new(adv.clone());
|
2021-04-05 17:55:53 +00:00
|
|
|
update_lock.update_ring.send(Arc::new(ring))?;
|
2020-04-11 21:53:32 +00:00
|
|
|
drop(update_lock);
|
2020-04-07 16:10:20 +00:00
|
|
|
|
2020-04-11 16:51:11 +00:00
|
|
|
self.background.spawn_cancellable(
|
2020-04-10 20:01:48 +00:00
|
|
|
self.clone()
|
2020-04-11 16:51:11 +00:00
|
|
|
.broadcast(Message::AdvertiseConfig(adv.clone()), PING_TIMEOUT)
|
|
|
|
.map(Ok),
|
2020-04-10 20:01:48 +00:00
|
|
|
);
|
2021-04-05 17:55:53 +00:00
|
|
|
|
2021-03-15 22:14:12 +00:00
|
|
|
self.background.spawn(self.clone().save_network_config());
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Ok(Message::Ok)
|
|
|
|
}
|
|
|
|
|
2020-06-30 16:33:14 +00:00
|
|
|
async fn ping_loop(self: Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
|
2021-04-05 17:55:53 +00:00
|
|
|
while !*stop_signal.borrow() {
|
2021-03-15 21:36:41 +00:00
|
|
|
let restart_at = tokio::time::sleep(PING_INTERVAL);
|
2020-04-10 20:01:48 +00:00
|
|
|
|
2020-04-11 21:53:32 +00:00
|
|
|
let status = self.status.borrow().clone();
|
|
|
|
let ping_addrs = status
|
|
|
|
.nodes
|
2020-04-10 20:01:48 +00:00
|
|
|
.iter()
|
|
|
|
.filter(|(id, _)| **id != self.id)
|
2020-04-21 17:08:42 +00:00
|
|
|
.map(|(id, status)| (status.addr, Some(*id)))
|
2020-04-10 20:01:48 +00:00
|
|
|
.collect::<Vec<_>>();
|
2020-04-06 20:27:51 +00:00
|
|
|
|
2020-04-06 22:00:43 +00:00
|
|
|
self.clone().ping_nodes(ping_addrs).await;
|
2020-04-06 20:27:51 +00:00
|
|
|
|
2020-04-11 16:51:11 +00:00
|
|
|
select! {
|
|
|
|
_ = restart_at.fuse() => (),
|
2021-04-05 17:55:53 +00:00
|
|
|
_ = stop_signal.changed().fuse() => (),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
async fn discovery_loop(
|
|
|
|
self: Arc<Self>,
|
|
|
|
bootstrap_peers: Vec<SocketAddr>,
|
|
|
|
mut stop_signal: watch::Receiver<bool>,
|
|
|
|
) {
|
|
|
|
while !*stop_signal.borrow() {
|
|
|
|
let not_configured = self.ring.borrow().config.members.len() == 0;
|
|
|
|
let no_peers = self.status.borrow().nodes.len() < 3;
|
|
|
|
let bad_peers = self
|
|
|
|
.status
|
|
|
|
.borrow()
|
|
|
|
.nodes
|
|
|
|
.iter()
|
|
|
|
.filter(|(_, v)| !v.is_up())
|
|
|
|
.count() != self.ring.borrow().config.members.len();
|
|
|
|
|
|
|
|
if not_configured || no_peers || bad_peers {
|
|
|
|
info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers);
|
|
|
|
|
|
|
|
let mut bp2 = bootstrap_peers
|
|
|
|
.iter()
|
|
|
|
.map(|ip| (*ip, None))
|
|
|
|
.collect::<Vec<_>>();
|
|
|
|
|
2021-04-05 18:33:24 +00:00
|
|
|
match self.persist_status.load_async().await {
|
2021-04-05 17:55:53 +00:00
|
|
|
Ok(peers) => {
|
|
|
|
bp2.extend(peers.iter().map(|x| (x.addr, Some(x.id))));
|
2020-04-11 16:51:11 +00:00
|
|
|
}
|
2021-04-05 17:55:53 +00:00
|
|
|
_ => (),
|
2020-04-11 16:51:11 +00:00
|
|
|
}
|
2021-04-05 17:55:53 +00:00
|
|
|
|
|
|
|
self.clone().ping_nodes(bp2).await;
|
|
|
|
}
|
|
|
|
|
|
|
|
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
|
|
|
|
select! {
|
|
|
|
_ = restart_at.fuse() => (),
|
|
|
|
_ = stop_signal.changed().fuse() => (),
|
2020-04-11 16:51:11 +00:00
|
|
|
}
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-30 16:33:14 +00:00
|
|
|
async fn consul_loop(
|
|
|
|
self: Arc<Self>,
|
|
|
|
mut stop_signal: watch::Receiver<bool>,
|
|
|
|
consul_host: String,
|
|
|
|
consul_service_name: String,
|
|
|
|
) {
|
2021-03-15 22:14:12 +00:00
|
|
|
while !*stop_signal.borrow() {
|
2020-06-30 16:33:14 +00:00
|
|
|
match get_consul_nodes(&consul_host, &consul_service_name).await {
|
|
|
|
Ok(mut node_list) => {
|
|
|
|
let ping_addrs = node_list.drain(..).map(|a| (a, None)).collect::<Vec<_>>();
|
|
|
|
self.clone().ping_nodes(ping_addrs).await;
|
|
|
|
}
|
|
|
|
Err(e) => {
|
|
|
|
warn!("Could not retrieve node list from Consul: {}", e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-05 17:55:53 +00:00
|
|
|
let restart_at = tokio::time::sleep(DISCOVERY_INTERVAL);
|
2020-06-30 16:33:14 +00:00
|
|
|
select! {
|
|
|
|
_ = restart_at.fuse() => (),
|
2021-03-15 22:14:12 +00:00
|
|
|
_ = stop_signal.changed().fuse() => (),
|
2020-06-30 16:33:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
fn pull_status(
|
2020-04-10 20:01:48 +00:00
|
|
|
self: Arc<Self>,
|
|
|
|
peer: UUID,
|
|
|
|
) -> impl futures::future::Future<Output = ()> + Send + 'static {
|
2020-04-06 20:54:03 +00:00
|
|
|
async move {
|
2020-04-18 17:21:34 +00:00
|
|
|
let resp = self
|
|
|
|
.rpc_client
|
2020-04-23 14:40:59 +00:00
|
|
|
.call(peer, Message::PullStatus, PING_TIMEOUT)
|
2020-04-18 17:21:34 +00:00
|
|
|
.await;
|
2020-04-06 20:54:03 +00:00
|
|
|
if let Ok(Message::AdvertiseNodesUp(nodes)) = resp {
|
|
|
|
let _: Result<_, _> = self.handle_advertise_nodes_up(&nodes).await;
|
|
|
|
}
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-12 16:58:19 +00:00
|
|
|
async fn pull_config(self: Arc<Self>, peer: UUID) {
|
2020-04-18 17:21:34 +00:00
|
|
|
let resp = self
|
|
|
|
.rpc_client
|
2020-04-23 14:40:59 +00:00
|
|
|
.call(peer, Message::PullConfig, PING_TIMEOUT)
|
2020-04-18 17:21:34 +00:00
|
|
|
.await;
|
2020-04-06 20:27:51 +00:00
|
|
|
if let Ok(Message::AdvertiseConfig(config)) = resp {
|
2020-04-06 20:54:03 +00:00
|
|
|
let _: Result<_, _> = self.handle_advertise_config(&config).await;
|
2020-04-06 20:27:51 +00:00
|
|
|
}
|
2020-04-06 19:02:15 +00:00
|
|
|
}
|
2021-04-05 17:55:53 +00:00
|
|
|
|
2021-04-05 18:26:01 +00:00
|
|
|
async fn update_status(self: &Arc<Self>, updaters: &Updaters, status: Status) {
|
|
|
|
if status.hash != self.status.borrow().hash {
|
2021-04-05 18:33:24 +00:00
|
|
|
let mut list = status.to_serializable_membership(&self);
|
|
|
|
|
|
|
|
// Combine with old peer list to make sure no peer is lost
|
|
|
|
match self.persist_status.load_async().await {
|
|
|
|
Ok(old_list) => {
|
|
|
|
for pp in old_list {
|
|
|
|
if !list.iter().any(|np| pp.id == np.id) {
|
|
|
|
list.push(pp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_ => (),
|
|
|
|
}
|
|
|
|
|
|
|
|
if list.len() > 0 {
|
2021-04-05 18:42:46 +00:00
|
|
|
info!("Persisting new peer list ({} peers)", list.len());
|
2021-04-05 18:35:26 +00:00
|
|
|
self.persist_status
|
|
|
|
.save_async(&list)
|
|
|
|
.await
|
2021-04-05 18:33:24 +00:00
|
|
|
.expect("Unable to persist peer list");
|
|
|
|
}
|
2021-04-05 18:26:01 +00:00
|
|
|
}
|
|
|
|
|
2021-04-05 17:55:53 +00:00
|
|
|
let status = Arc::new(status);
|
|
|
|
updaters
|
|
|
|
.update_status
|
|
|
|
.send(status.clone())
|
|
|
|
.expect("Could not update internal membership status");
|
|
|
|
}
|
2020-04-06 17:55:39 +00:00
|
|
|
}
|