2021-10-14 09:50:12 +00:00
//! Module containing structs related to membership management
2021-10-15 09:05:09 +00:00
use std ::collections ::HashMap ;
2021-10-14 09:50:12 +00:00
use std ::io ::{ Read , Write } ;
2022-03-06 13:50:00 +00:00
use std ::net ::{ IpAddr , SocketAddr } ;
2022-07-08 11:30:26 +00:00
use std ::path ::{ Path , PathBuf } ;
2023-01-26 14:30:36 +00:00
use std ::sync ::atomic ::Ordering ;
2021-10-15 09:05:09 +00:00
use std ::sync ::{ Arc , RwLock } ;
2021-10-19 14:16:10 +00:00
use std ::time ::{ Duration , Instant } ;
2021-10-14 09:50:12 +00:00
use arc_swap ::ArcSwap ;
use async_trait ::async_trait ;
2023-09-12 12:35:48 +00:00
use futures ::join ;
2021-10-14 09:50:12 +00:00
use serde ::{ Deserialize , Serialize } ;
use sodiumoxide ::crypto ::sign ::ed25519 ;
2023-09-12 12:35:48 +00:00
use tokio ::select ;
2021-10-14 09:50:12 +00:00
use tokio ::sync ::watch ;
use tokio ::sync ::Mutex ;
2021-10-15 09:05:09 +00:00
use netapp ::endpoint ::{ Endpoint , EndpointHandler } ;
2022-07-22 13:20:00 +00:00
use netapp ::message ::* ;
2021-10-14 09:50:12 +00:00
use netapp ::peering ::fullmesh ::FullMeshPeeringStrategy ;
2022-09-14 14:09:38 +00:00
use netapp ::util ::parse_and_resolve_peer_addr_async ;
2021-10-19 14:16:10 +00:00
use netapp ::{ NetApp , NetworkKey , NodeID , NodeKey } ;
2021-10-14 09:50:12 +00:00
2021-10-19 14:16:10 +00:00
use garage_util ::config ::Config ;
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
use garage_util ::config ::KubernetesDiscoveryConfig ;
2021-11-09 11:24:04 +00:00
use garage_util ::data ::* ;
2021-10-19 14:16:10 +00:00
use garage_util ::error ::* ;
2021-10-14 09:50:12 +00:00
use garage_util ::persister ::Persister ;
2021-10-15 09:05:09 +00:00
use garage_util ::time ::* ;
2021-10-14 09:50:12 +00:00
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 17:11:16 +00:00
use crate ::consul ::ConsulDiscovery ;
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-03-06 13:50:00 +00:00
use crate ::kubernetes ::* ;
2021-11-09 11:24:04 +00:00
use crate ::layout ::* ;
2022-12-05 14:28:57 +00:00
use crate ::replication_mode ::* ;
2021-10-14 09:50:12 +00:00
use crate ::ring ::* ;
2021-10-15 09:05:09 +00:00
use crate ::rpc_helper ::* ;
2021-10-14 09:50:12 +00:00
2023-01-09 17:15:55 +00:00
use crate ::system_metrics ::* ;
2021-10-14 09:50:12 +00:00
// Pause between two iterations of `System::discovery_loop`.
const DISCOVERY_INTERVAL : Duration = Duration ::from_secs ( 60 ) ;
2021-10-15 09:05:09 +00:00
// Period of `System::status_exchange_loop`; also used there as the
// timeout of the status broadcast.
const STATUS_EXCHANGE_INTERVAL : Duration = Duration ::from_secs ( 10 ) ;
2021-10-14 09:50:12 +00:00
2022-09-07 09:59:56 +00:00
/// Version tag used for version check upon Netapp connection.
/// Cluster nodes with different version tags are deemed
/// incompatible and will refuse to connect.
///
/// The six high bytes are the ASCII string "garage"; the two low
/// bytes are the version number (currently 0x0008).
pub const GARAGE_VERSION_TAG : u64 = 0x6761726167650008 ; // garage 0x0008
2022-02-18 19:39:55 +00:00
2021-10-14 09:50:12 +00:00
/// RPC endpoint used for calls related to membership
pub const SYSTEM_RPC_PATH : & str = " garage_rpc/membership.rs/SystemRpc " ;
/// RPC messages related to membership
#[ derive(Debug, Serialize, Deserialize, Clone) ]
pub enum SystemRpc {
/// Generic success response (returned by the connect, status
/// advertisement and layout advertisement handlers)
Ok ,
2021-10-15 09:05:09 +00:00
/// Request to connect to a specific node (in <pubkey>@<host>:<port> format)
Connect ( String ) ,
2021-11-09 11:24:04 +00:00
/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
PullClusterLayout ,
2021-10-14 09:50:12 +00:00
/// Advertise Garage status. Answered with another AdvertiseStatus.
/// Exchanged with every node on a regular basis.
// NOTE(review): `handle_advertise_status` in this file replies with `Ok`,
// not with another `AdvertiseStatus` -- confirm which one is intended.
2021-10-15 09:05:09 +00:00
AdvertiseStatus ( NodeStatus ) ,
2021-11-09 11:24:04 +00:00
/// Advertisement of cluster layout. Sent spontaneously or in response to PullClusterLayout
AdvertiseClusterLayout ( ClusterLayout ) ,
2021-10-14 09:50:12 +00:00
/// Get known nodes states
GetKnownNodes ,
/// Return known nodes
2021-10-15 09:05:09 +00:00
ReturnKnownNodes ( Vec < KnownNodeInfo > ) ,
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
// A `SystemRpc` request is answered with another `SystemRpc` message,
// or with an `Error`.
impl Rpc for SystemRpc {
type Response = Result < SystemRpc , Error > ;
2021-10-14 09:50:12 +00:00
}
2023-01-03 13:44:47 +00:00
/// List of known peers, as `(node id, address)` pairs, in the form
/// that is saved to and loaded from disk through `persist_peer_list`.
#[ derive(Serialize, Deserialize) ]
pub struct PeerList ( Vec < ( Uuid , SocketAddr ) > ) ;
// NOTE(review): presumably marks `PeerList` as the first version of its
// on-disk format for `garage_util::migrate` -- confirm against that module.
impl garage_util ::migrate ::InitialFormat for PeerList { }
2021-10-14 09:50:12 +00:00
/// This node's membership manager
pub struct System {
/// The id of this node
2021-10-15 09:05:09 +00:00
pub id : Uuid ,
2021-10-14 09:50:12 +00:00
2021-11-09 11:24:04 +00:00
/// Persister used to load and save the cluster layout
persist_cluster_layout : Persister < ClusterLayout > ,
2023-01-03 13:44:47 +00:00
/// Persister used to load and save the list of known peers
persist_peer_list : Persister < PeerList > ,
2021-10-14 09:50:12 +00:00
2021-10-15 09:05:09 +00:00
/// Status of this node, refreshed by `update_local_status`
local_status : ArcSwap < NodeStatus > ,
/// Last status received from each peer, paired with the `now_msec()`
/// timestamp taken when it was received
node_status : RwLock < HashMap < Uuid , ( u64 , NodeStatus ) > > ,
2021-10-14 09:50:12 +00:00
/// Netapp instance of this node
pub netapp : Arc < NetApp > ,
/// Full-mesh peering strategy; also the source of the known peer list
fullmesh : Arc < FullMeshPeeringStrategy > ,
/// Helper used to send RPCs to other nodes
pub rpc : RpcHelper ,
/// Endpoint for `SystemRpc` messages; this `System` is its handler
system_endpoint : Arc < Endpoint < SystemRpc , System > > ,
/// Address on which `run` makes netapp listen
rpc_listen_addr : SocketAddr ,
2022-10-18 16:38:20 +00:00
/// Address advertised to Consul / Kubernetes, if one could be
/// determined (only present when a discovery feature is enabled)
#[ cfg(any(feature = " consul-discovery " , feature = " kubernetes-discovery " )) ]
2021-10-15 09:05:09 +00:00
rpc_public_addr : Option < SocketAddr > ,
2022-09-14 14:09:38 +00:00
/// Peer specifications from the configuration, resolved at each
/// discovery step
bootstrap_peers : Vec < String > ,
2022-03-06 13:50:00 +00:00
2022-10-18 16:38:20 +00:00
/// Consul discovery client, if configured
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 17:11:16 +00:00
consul_discovery : Option < ConsulDiscovery > ,
2022-03-16 11:09:50 +00:00
/// Kubernetes discovery configuration, if configured
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-10-18 16:38:20 +00:00
kubernetes_discovery : Option < KubernetesDiscoveryConfig > ,
2023-01-26 14:30:36 +00:00
2023-01-09 17:15:55 +00:00
/// Metrics object handed to `NodeStatus::update_disk_usage`
metrics : SystemMetrics ,
2022-03-06 13:50:00 +00:00
2022-12-05 14:28:57 +00:00
/// Replication mode; `health` reads its write quorum
replication_mode : ReplicationMode ,
2021-10-14 09:50:12 +00:00
/// Replication factor, derived from `replication_mode` in `new`
replication_factor : usize ,
/// The ring
pub ring : watch ::Receiver < Arc < Ring > > ,
/// Sender side of `ring`; the mutex serializes layout updates
update_ring : Mutex < watch ::Sender < Arc < Ring > > > ,
2022-07-08 11:30:26 +00:00
/// Path to metadata directory
pub metadata_dir : PathBuf ,
2023-01-26 14:04:32 +00:00
/// Path to data directory
pub data_dir : PathBuf ,
2021-10-14 09:50:12 +00:00
}
/// Status of a node, as exchanged between nodes in
/// `SystemRpc::AdvertiseStatus` messages.
#[ derive(Debug, Clone, Serialize, Deserialize) ]
2021-10-15 09:05:09 +00:00
pub struct NodeStatus {
2021-10-14 09:50:12 +00:00
/// Hostname of the node
pub hostname : String ,
2023-01-26 14:04:32 +00:00
2021-10-14 09:50:12 +00:00
/// Replication factor configured on the node
pub replication_factor : usize ,
2021-11-09 11:24:04 +00:00
/// Cluster layout version
pub cluster_layout_version : u64 ,
/// Hash of cluster layout staging data
pub cluster_layout_staging_hash : Hash ,
2023-01-26 14:04:32 +00:00
/// Disk usage on partition containing metadata directory (tuple: `(avail, total)`)
// `serde(default)`: a message without this field deserializes to `None`.
#[ serde(default) ]
pub meta_disk_avail : Option < ( u64 , u64 ) > ,
/// Disk usage on partition containing data directory (tuple: `(avail, total)`)
// `serde(default)`: a message without this field deserializes to `None`.
#[ serde(default) ]
pub data_disk_avail : Option < ( u64 , u64 ) > ,
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
/// Information about one known peer, as returned by
/// `System::get_known_nodes`.
#[ derive(Debug, Clone, Serialize, Deserialize) ]
pub struct KnownNodeInfo {
/// Id of the peer
pub id : Uuid ,
/// Address of the peer
pub addr : SocketAddr ,
/// Whether the peering layer currently reports the peer as up
pub is_up : bool ,
2021-10-19 14:16:10 +00:00
/// Seconds elapsed since the peer was last seen, if it ever was
pub last_seen_secs_ago : Option < u64 > ,
2021-10-15 09:05:09 +00:00
/// Last status received from the peer, or `NodeStatus::unknown()`
/// if none was received
pub status : NodeStatus ,
}
2022-12-05 14:28:57 +00:00
/// Summary of the cluster's health, as computed by `System::health`.
#[ derive(Debug, Clone, Copy, Serialize, Deserialize) ]
pub struct ClusterHealth {
/// The current health status of the cluster (see below)
pub status : ClusterHealthStatus ,
/// Number of nodes already seen once in the cluster
pub known_nodes : usize ,
/// Number of nodes currently connected
pub connected_nodes : usize ,
/// Number of storage nodes declared in the current layout
pub storage_nodes : usize ,
/// Number of storage nodes currently connected
pub storage_nodes_ok : usize ,
/// Number of partitions in the layout
pub partitions : usize ,
/// Number of partitions for which we have a quorum of connected nodes
pub partitions_quorum : usize ,
/// Number of partitions for which all storage nodes are connected
pub partitions_all_ok : usize ,
}
/// Overall health verdict reported in `ClusterHealth::status`.
#[ derive(Debug, Clone, Copy, Serialize, Deserialize) ]
pub enum ClusterHealthStatus {
/// All nodes are available
Healthy ,
/// Some storage nodes are unavailable, but quorum is still
/// achieved for all partitions
Degraded ,
/// Quorum is not available for some partitions
Unavailable ,
}
2021-10-19 14:16:10 +00:00
pub fn read_node_id ( metadata_dir : & Path ) -> Result < NodeID , Error > {
let mut pubkey_file = metadata_dir . to_path_buf ( ) ;
pubkey_file . push ( " node_key.pub " ) ;
let mut f = std ::fs ::File ::open ( pubkey_file . as_path ( ) ) ? ;
let mut d = vec! [ ] ;
f . read_to_end ( & mut d ) ? ;
if d . len ( ) ! = 32 {
return Err ( Error ::Message ( " Corrupt node_key.pub file " . to_string ( ) ) ) ;
}
let mut key = [ 0 u8 ; 32 ] ;
key . copy_from_slice ( & d [ .. ] ) ;
Ok ( NodeID ::from_slice ( & key [ .. ] ) . unwrap ( ) )
}
pub fn gen_node_key ( metadata_dir : & Path ) -> Result < NodeKey , Error > {
2021-10-15 09:05:09 +00:00
let mut key_file = metadata_dir . to_path_buf ( ) ;
key_file . push ( " node_key " ) ;
if key_file . as_path ( ) . exists ( ) {
let mut f = std ::fs ::File ::open ( key_file . as_path ( ) ) ? ;
2021-10-14 09:50:12 +00:00
let mut d = vec! [ ] ;
f . read_to_end ( & mut d ) ? ;
if d . len ( ) ! = 64 {
2021-10-15 09:05:09 +00:00
return Err ( Error ::Message ( " Corrupt node_key file " . to_string ( ) ) ) ;
2021-10-14 09:50:12 +00:00
}
let mut key = [ 0 u8 ; 64 ] ;
key . copy_from_slice ( & d [ .. ] ) ;
Ok ( NodeKey ::from_slice ( & key [ .. ] ) . unwrap ( ) )
} else {
2021-10-19 14:16:10 +00:00
if ! metadata_dir . exists ( ) {
info! ( " Metadata directory does not exist, creating it. " ) ;
2023-01-26 16:26:32 +00:00
std ::fs ::create_dir ( metadata_dir ) ? ;
2021-10-19 14:16:10 +00:00
}
info! ( " Generating new node key pair. " ) ;
let ( pubkey , key ) = ed25519 ::gen_keypair ( ) ;
{
use std ::os ::unix ::fs ::PermissionsExt ;
let mut f = std ::fs ::File ::create ( key_file . as_path ( ) ) ? ;
let mut perm = f . metadata ( ) ? . permissions ( ) ;
perm . set_mode ( 0o600 ) ;
std ::fs ::set_permissions ( key_file . as_path ( ) , perm ) ? ;
f . write_all ( & key [ .. ] ) ? ;
}
{
let mut pubkey_file = metadata_dir . to_path_buf ( ) ;
pubkey_file . push ( " node_key.pub " ) ;
let mut f2 = std ::fs ::File ::create ( pubkey_file . as_path ( ) ) ? ;
f2 . write_all ( & pubkey [ .. ] ) ? ;
}
2021-10-14 09:50:12 +00:00
2021-10-15 09:05:09 +00:00
Ok ( key )
2021-10-14 09:50:12 +00:00
}
}
impl System {
/// Create this node's membership manager
///
/// Loads (or generates) the node key, loads the persisted cluster layout
/// (or starts from a fresh one), determines the public RPC address, sets
/// up netapp, the full-mesh peering strategy and the system RPC endpoint,
/// and registers the returned `System` as handler of that endpoint.
///
/// Returns an error if the persisted layout has a replication factor
/// different from the configured one, or if the Consul discovery
/// configuration is invalid.
// NOTE(review): a failure of `gen_node_key` panics (`expect`) instead of
// being returned as an error.
pub fn new (
network_key : NetworkKey ,
2022-12-05 14:28:57 +00:00
replication_mode : ReplicationMode ,
2021-10-19 14:16:10 +00:00
config : & Config ,
2022-09-13 14:22:23 +00:00
) -> Result < Arc < Self > , Error > {
2022-12-05 14:28:57 +00:00
let replication_factor = replication_mode . replication_factor ( ) ;
2021-10-19 14:16:10 +00:00
let node_key =
gen_node_key ( & config . metadata_dir ) . expect ( " Unable to read or generate node ID " ) ;
2022-02-18 19:39:55 +00:00
// Only the first 8 bytes of the public key are logged.
info! (
" Node ID of this node: {} " ,
hex ::encode ( & node_key . public_key ( ) [ .. 8 ] )
) ;
2021-10-14 09:50:12 +00:00
2022-09-13 14:22:23 +00:00
let persist_cluster_layout : Persister < ClusterLayout > =
Persister ::new ( & config . metadata_dir , " cluster_layout " ) ;
2021-10-19 14:16:10 +00:00
let persist_peer_list = Persister ::new ( & config . metadata_dir , " peer_list " ) ;
2021-10-14 09:50:12 +00:00
2021-11-09 11:24:04 +00:00
// Reuse the persisted layout if it loads and matches the configured
// replication factor; a load failure is not fatal, a mismatch is.
let cluster_layout = match persist_cluster_layout . load ( ) {
2022-09-13 14:22:23 +00:00
Ok ( x ) = > {
if x . replication_factor ! = replication_factor {
return Err ( Error ::Message ( format! (
" Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory. " ,
x . replication_factor ,
replication_factor
) ) ) ;
}
x
}
2021-10-14 09:50:12 +00:00
Err ( e ) = > {
2021-10-15 09:05:09 +00:00
info! (
2021-11-09 11:24:04 +00:00
" No valid previous cluster layout stored ({}), starting fresh. " ,
2021-10-15 09:05:09 +00:00
e
) ;
2021-11-09 11:24:04 +00:00
ClusterLayout ::new ( replication_factor )
2021-10-14 09:50:12 +00:00
}
} ;
2023-01-09 17:15:55 +00:00
let metrics = SystemMetrics ::new ( replication_factor ) ;
2023-01-26 14:30:36 +00:00
let mut local_status = NodeStatus ::initial ( replication_factor , & cluster_layout ) ;
local_status . update_disk_usage ( & config . metadata_dir , & config . data_dir , & metrics ) ;
2021-11-09 11:24:04 +00:00
let ring = Ring ::new ( cluster_layout , replication_factor ) ;
2021-10-14 09:50:12 +00:00
let ( update_ring , ring ) = watch ::channel ( Arc ::new ( ring ) ) ;
2022-09-14 14:09:38 +00:00
// Public RPC address: resolved from the configuration when given,
// otherwise built from `get_default_ip()` and the port of the bind
// address. Any failure yields `None`.
let rpc_public_addr = match & config . rpc_public_addr {
Some ( a_str ) = > {
use std ::net ::ToSocketAddrs ;
match a_str . to_socket_addrs ( ) {
Err ( e ) = > {
error! (
" Cannot resolve rpc_public_addr {} from config file: {}. " ,
a_str , e
) ;
None
}
Ok ( a ) = > {
let a = a . collect ::< Vec < _ > > ( ) ;
if a . is_empty ( ) {
error! ( " rpc_public_addr {} resolve to no known IP address " , a_str ) ;
}
if a . len ( ) > 1 {
warn! ( " Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one. " , a ) ;
}
// First resolved address, or `None` when the list is empty.
a . into_iter ( ) . next ( )
}
}
}
2022-03-16 11:09:50 +00:00
None = > {
let addr =
get_default_ip ( ) . map ( | ip | SocketAddr ::new ( ip , config . rpc_bind_addr . port ( ) ) ) ;
if let Some ( a ) = addr {
warn! ( " Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible. " , a ) ;
}
addr
}
} ;
2022-09-14 14:09:38 +00:00
if rpc_public_addr . is_none ( ) {
warn! ( " This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication. " ) ;
}
2022-03-16 11:09:50 +00:00
2022-02-18 19:39:55 +00:00
let netapp = NetApp ::new ( GARAGE_VERSION_TAG , network_key , node_key ) ;
2022-09-14 14:09:38 +00:00
// The peering strategy starts with an empty bootstrap list; peers are
// connected later by `discovery_loop`.
let fullmesh = FullMeshPeeringStrategy ::new ( netapp . clone ( ) , vec! [ ] , rpc_public_addr ) ;
2022-09-19 18:12:19 +00:00
if let Some ( ping_timeout ) = config . rpc_ping_timeout_msec {
fullmesh . set_ping_timeout_millis ( ping_timeout ) ;
}
2021-10-14 09:50:12 +00:00
let system_endpoint = netapp . endpoint ( SYSTEM_RPC_PATH . into ( ) ) ;
2022-10-18 17:13:52 +00:00
#[ cfg(feature = " consul-discovery " ) ]
let consul_discovery = match & config . consul_discovery {
Some ( cfg ) = > Some (
ConsulDiscovery ::new ( cfg . clone ( ) )
. ok_or_message ( " Invalid Consul discovery configuration " ) ? ,
) ,
None = > None ,
} ;
2022-10-18 16:38:20 +00:00
// Warn when a discovery mechanism is configured but its feature is
// not compiled in.
#[ cfg(not(feature = " consul-discovery " )) ]
if config . consul_discovery . is_some ( ) {
warn! ( " Consul discovery is not enabled in this build. " ) ;
}
2022-03-16 11:09:50 +00:00
#[ cfg(not(feature = " kubernetes-discovery " )) ]
2022-10-18 16:38:20 +00:00
if config . kubernetes_discovery . is_some ( ) {
2022-03-16 11:09:50 +00:00
warn! ( " Kubernetes discovery is not enabled in this build. " ) ;
}
2021-10-14 09:50:12 +00:00
let sys = Arc ::new ( System {
2021-10-15 09:05:09 +00:00
id : netapp . id . into ( ) ,
2021-11-09 11:24:04 +00:00
persist_cluster_layout ,
2021-10-15 09:05:09 +00:00
persist_peer_list ,
local_status : ArcSwap ::new ( Arc ::new ( local_status ) ) ,
node_status : RwLock ::new ( HashMap ::new ( ) ) ,
2021-10-14 09:50:12 +00:00
netapp : netapp . clone ( ) ,
fullmesh : fullmesh . clone ( ) ,
2022-09-19 18:12:19 +00:00
rpc : RpcHelper ::new (
netapp . id . into ( ) ,
fullmesh ,
ring . clone ( ) ,
config . rpc_timeout_msec . map ( Duration ::from_millis ) ,
) ,
2021-10-14 09:50:12 +00:00
system_endpoint ,
2022-12-05 14:28:57 +00:00
replication_mode ,
2021-10-14 09:50:12 +00:00
replication_factor ,
2021-10-19 14:16:10 +00:00
rpc_listen_addr : config . rpc_bind_addr ,
2022-10-18 16:38:20 +00:00
#[ cfg(any(feature = " consul-discovery " , feature = " kubernetes-discovery " )) ]
2022-03-16 11:09:50 +00:00
rpc_public_addr ,
2021-10-19 14:16:10 +00:00
bootstrap_peers : config . bootstrap_peers . clone ( ) ,
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 17:11:16 +00:00
consul_discovery ,
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-10-18 16:38:20 +00:00
kubernetes_discovery : config . kubernetes_discovery . clone ( ) ,
2023-01-09 17:15:55 +00:00
metrics ,
2022-03-06 13:50:00 +00:00
2021-10-14 09:50:12 +00:00
ring ,
update_ring : Mutex ::new ( update_ring ) ,
2022-07-08 11:30:26 +00:00
metadata_dir : config . metadata_dir . clone ( ) ,
2023-01-26 14:04:32 +00:00
data_dir : config . data_dir . clone ( ) ,
2021-10-14 09:50:12 +00:00
} ) ;
// The endpoint needs a handle to the fully built `System`, so the
// handler is registered once the `Arc` exists.
sys . system_endpoint . set_handler ( sys . clone ( ) ) ;
2022-09-13 14:22:23 +00:00
Ok ( sys )
2021-10-14 09:50:12 +00:00
}
/// Perform bootstrapping and run the background tasks of this node.
///
/// Concurrently drives the netapp listener, the full-mesh peering
/// strategy, the discovery loop and the status exchange loop, and
/// returns once all four have completed. Each of them receives its own
/// clone of `must_exit`.
pub async fn run(self: Arc<Self>, must_exit: watch::Receiver<bool>) {
    let listener = self
        .netapp
        .clone()
        .listen(self.rpc_listen_addr, None, must_exit.clone());
    let peering = self.fullmesh.clone().run(must_exit.clone());
    let discovery = self.discovery_loop(must_exit.clone());
    let status_exchange = self.status_exchange_loop(must_exit.clone());

    join!(listener, peering, discovery, status_exchange);
}
2022-05-24 10:16:39 +00:00
// ---- Administrative operations (directly available and
// also available through RPC) ----
pub fn get_known_nodes ( & self ) -> Vec < KnownNodeInfo > {
let node_status = self . node_status . read ( ) . unwrap ( ) ;
let known_nodes = self
. fullmesh
. get_peer_list ( )
. iter ( )
. map ( | n | KnownNodeInfo {
id : n . id . into ( ) ,
addr : n . addr ,
is_up : n . is_up ( ) ,
2022-09-29 13:53:54 +00:00
last_seen_secs_ago : n
. last_seen
. map ( | t | ( Instant ::now ( ) . saturating_duration_since ( t ) ) . as_secs ( ) ) ,
2022-05-24 10:16:39 +00:00
status : node_status
. get ( & n . id . into ( ) )
. cloned ( )
. map ( | ( _ , st ) | st )
2023-01-26 16:26:32 +00:00
. unwrap_or_else ( NodeStatus ::unknown ) ,
2022-05-24 10:16:39 +00:00
} )
. collect ::< Vec < _ > > ( ) ;
known_nodes
}
/// Return a copy of the cluster layout of the current ring.
pub fn get_cluster_layout(&self) -> ClusterLayout {
    let current_ring = self.ring.borrow();
    current_ring.layout.clone()
}
pub async fn update_cluster_layout (
self : & Arc < Self > ,
layout : & ClusterLayout ,
) -> Result < ( ) , Error > {
self . handle_advertise_cluster_layout ( layout ) . await ? ;
Ok ( ( ) )
}
pub async fn connect ( & self , node : & str ) -> Result < ( ) , Error > {
2022-09-14 14:09:38 +00:00
let ( pubkey , addrs ) = parse_and_resolve_peer_addr_async ( node )
. await
. ok_or_else ( | | {
Error ::Message ( format! (
" Unable to parse or resolve node specification: {} " ,
node
) )
} ) ? ;
2022-05-24 10:16:39 +00:00
let mut errors = vec! [ ] ;
2022-12-14 11:57:33 +00:00
for addr in addrs . iter ( ) {
2022-12-14 15:11:19 +00:00
match self . netapp . clone ( ) . try_connect ( * addr , pubkey ) . await {
2022-05-24 10:16:39 +00:00
Ok ( ( ) ) = > return Ok ( ( ) ) ,
Err ( e ) = > {
2022-12-14 15:11:19 +00:00
errors . push ( (
* addr ,
Error ::Message ( connect_error_message ( * addr , pubkey , e ) ) ,
) ) ;
2022-05-24 10:16:39 +00:00
}
}
}
if errors . len ( ) = = 1 {
Err ( Error ::Message ( errors [ 0 ] . 1. to_string ( ) ) )
} else {
Err ( Error ::Message ( format! ( " {:?} " , errors ) ) )
}
}
2022-12-05 14:28:57 +00:00
/// Compute a health summary of the cluster from the current ring and the
/// list of known nodes.
///
/// The status is `Healthy` when every partition has a write quorum of
/// connected nodes and every storage node is connected, `Degraded` when
/// every partition has a quorum but some storage node is not connected,
/// and `Unavailable` otherwise.
pub fn health ( & self ) -> ClusterHealth {
let ring : Arc < _ > = self . ring . borrow ( ) . clone ( ) ;
let quorum = self . replication_mode . write_quorum ( ) ;
let replication_factor = self . replication_factor ;
// Index the known nodes by id for the lookups below.
let nodes = self
. get_known_nodes ( )
. into_iter ( )
. map ( | n | ( n . id , n ) )
. collect ::< HashMap < Uuid , _ > > ( ) ;
let connected_nodes = nodes . iter ( ) . filter ( | ( _ , n ) | n . is_up ) . count ( ) ;
// Storage nodes are the layout roles that have a capacity set.
let storage_nodes = ring
. layout
. roles
. items ( )
. iter ( )
. filter ( | ( _ , _ , v ) | matches! ( v , NodeRoleV ( Some ( r ) ) if r . capacity . is_some ( ) ) )
. collect ::< Vec < _ > > ( ) ;
// A node that is not in the known nodes map counts as down.
let storage_nodes_ok = storage_nodes
. iter ( )
. filter ( | ( x , _ , _ ) | nodes . get ( x ) . map ( | n | n . is_up ) . unwrap_or ( false ) )
. count ( ) ;
let partitions = ring . partitions ( ) ;
// For each partition, the number of its nodes that are currently up.
let partitions_n_up = partitions
. iter ( )
. map ( | ( _ , h ) | {
let pn = ring . get_nodes ( h , ring . replication_factor ) ;
pn . iter ( )
. filter ( | x | nodes . get ( x ) . map ( | n | n . is_up ) . unwrap_or ( false ) )
. count ( )
} )
. collect ::< Vec < usize > > ( ) ;
let partitions_all_ok = partitions_n_up
. iter ( )
. filter ( | c | * * c = = replication_factor )
. count ( ) ;
let partitions_quorum = partitions_n_up . iter ( ) . filter ( | c | * * c > = quorum ) . count ( ) ;
let status =
if partitions_quorum = = partitions . len ( ) & & storage_nodes_ok = = storage_nodes . len ( ) {
ClusterHealthStatus ::Healthy
} else if partitions_quorum = = partitions . len ( ) {
ClusterHealthStatus ::Degraded
} else {
ClusterHealthStatus ::Unavailable
} ;
ClusterHealth {
status ,
known_nodes : nodes . len ( ) ,
connected_nodes ,
storage_nodes : storage_nodes . len ( ) ,
storage_nodes_ok ,
partitions : partitions . len ( ) ,
partitions_quorum ,
partitions_all_ok ,
}
}
2021-10-14 09:50:12 +00:00
// ---- INTERNALS ----
2022-10-18 16:38:20 +00:00
/// Publish this node as a Consul service.
///
/// Does nothing when Consul discovery is not configured. Logs a warning
/// when the public RPC address is unknown, and an error when the
/// publication fails; nothing is returned to the caller.
#[cfg(feature = " consul-discovery ")]
async fn advertise_to_consul(self: Arc<Self>) {
    if let Some(consul) = &self.consul_discovery {
        match self.rpc_public_addr {
            None => {
                warn!(" Not advertising to Consul because rpc_public_addr is not defined in config file and could not be autodetected. ");
            }
            Some(public_addr) => {
                let status = self.local_status.load_full();
                let published = consul
                    .publish_consul_service(self.netapp.id, &status.hostname, public_addr)
                    .await;
                if let Err(e) = published {
                    error!(" Error while publishing Consul service: {} ", e);
                }
            }
        }
    }
}
2022-03-16 11:09:50 +00:00
/// Publish this node to Kubernetes.
///
/// Does nothing when Kubernetes discovery is not configured. Logs a
/// warning when the public RPC address is unknown, and an error when the
/// publication fails; nothing is returned to the caller.
#[cfg(feature = " kubernetes-discovery ")]
async fn advertise_to_kubernetes(self: Arc<Self>) {
    if let Some(k8s_config) = &self.kubernetes_discovery {
        match self.rpc_public_addr {
            None => {
                warn!(" Not advertising to Kubernetes because rpc_public_addr is not defined in config file and could not be autodetected. ");
            }
            Some(public_addr) => {
                let status = self.local_status.load_full();
                let published = publish_kubernetes_node(
                    k8s_config,
                    self.netapp.id,
                    &status.hostname,
                    public_addr,
                )
                .await;
                if let Err(e) = published {
                    error!(" Error while publishing node to Kubernetes: {} ", e);
                }
            }
        }
    }
}
2021-10-14 09:50:12 +00:00
/// Save network configuration to disc
2022-12-14 11:51:16 +00:00
async fn save_cluster_layout ( & self ) -> Result < ( ) , Error > {
2021-10-14 09:50:12 +00:00
let ring : Arc < Ring > = self . ring . borrow ( ) . clone ( ) ;
2021-11-09 11:24:04 +00:00
self . persist_cluster_layout
. save_async ( & ring . layout )
2021-10-14 09:50:12 +00:00
. await
2021-11-09 11:24:04 +00:00
. expect ( " Cannot save current cluster layout " ) ;
2021-10-14 09:50:12 +00:00
Ok ( ( ) )
}
2021-10-15 09:05:09 +00:00
fn update_local_status ( & self ) {
let mut new_si : NodeStatus = self . local_status . load ( ) . as_ref ( ) . clone ( ) ;
2021-10-14 09:50:12 +00:00
let ring = self . ring . borrow ( ) ;
2021-11-09 11:24:04 +00:00
new_si . cluster_layout_version = ring . layout . version ;
new_si . cluster_layout_staging_hash = ring . layout . staging_hash ;
2023-01-26 14:04:32 +00:00
2023-01-26 14:30:36 +00:00
new_si . update_disk_usage ( & self . metadata_dir , & self . data_dir , & self . metrics ) ;
2023-01-26 14:04:32 +00:00
2021-10-15 09:05:09 +00:00
self . local_status . swap ( Arc ::new ( new_si ) ) ;
}
2022-05-24 10:16:39 +00:00
// --- RPC HANDLERS ---
2021-10-15 09:05:09 +00:00
async fn handle_connect ( & self , node : & str ) -> Result < SystemRpc , Error > {
2022-05-24 10:16:39 +00:00
self . connect ( node ) . await ? ;
Ok ( SystemRpc ::Ok )
2021-10-14 09:50:12 +00:00
}
2021-11-09 11:24:04 +00:00
fn handle_pull_cluster_layout ( & self ) -> SystemRpc {
2021-10-14 09:50:12 +00:00
let ring = self . ring . borrow ( ) . clone ( ) ;
2021-11-09 11:24:04 +00:00
SystemRpc ::AdvertiseClusterLayout ( ring . layout . clone ( ) )
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
fn handle_get_known_nodes ( & self ) -> SystemRpc {
2022-05-24 10:16:39 +00:00
let known_nodes = self . get_known_nodes ( ) ;
2021-10-15 09:05:09 +00:00
SystemRpc ::ReturnKnownNodes ( known_nodes )
}
async fn handle_advertise_status (
self : & Arc < Self > ,
from : Uuid ,
info : & NodeStatus ,
) -> Result < SystemRpc , Error > {
let local_info = self . local_status . load ( ) ;
if local_info . replication_factor < info . replication_factor {
2022-09-13 14:22:23 +00:00
error! ( " Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. Shutting down for safety. " ,
2021-10-15 09:05:09 +00:00
info . replication_factor ,
local_info . replication_factor ) ;
std ::process ::exit ( 1 ) ;
}
2021-11-09 11:24:04 +00:00
if info . cluster_layout_version > local_info . cluster_layout_version
| | info . cluster_layout_staging_hash ! = local_info . cluster_layout_staging_hash
{
2022-12-14 15:08:05 +00:00
tokio ::spawn ( self . clone ( ) . pull_cluster_layout ( from ) ) ;
2021-10-15 09:05:09 +00:00
}
self . node_status
. write ( )
. unwrap ( )
. insert ( from , ( now_msec ( ) , info . clone ( ) ) ) ;
Ok ( SystemRpc ::Ok )
}
2021-11-09 11:24:04 +00:00
async fn handle_advertise_cluster_layout (
2022-05-24 10:16:39 +00:00
self : & Arc < Self > ,
2021-11-09 11:24:04 +00:00
adv : & ClusterLayout ,
2021-10-14 09:50:12 +00:00
) -> Result < SystemRpc , Error > {
2022-09-13 14:22:23 +00:00
if adv . replication_factor ! = self . replication_factor {
let msg = format! (
" Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received. " ,
adv . replication_factor ,
self . replication_factor
) ;
error! ( " {} " , msg ) ;
return Err ( Error ::Message ( msg ) ) ;
}
2021-10-14 09:50:12 +00:00
let update_ring = self . update_ring . lock ( ) . await ;
2021-11-09 11:24:04 +00:00
let mut layout : ClusterLayout = self . ring . borrow ( ) . layout . clone ( ) ;
let prev_layout_check = layout . check ( ) ;
if layout . merge ( adv ) {
if prev_layout_check & & ! layout . check ( ) {
error! ( " New cluster layout is invalid, discarding. " ) ;
return Err ( Error ::Message (
" New cluster layout is invalid, discarding. " . into ( ) ,
) ) ;
}
2021-10-14 09:50:12 +00:00
2021-11-09 11:24:04 +00:00
let ring = Ring ::new ( layout . clone ( ) , self . replication_factor ) ;
2021-10-14 09:50:12 +00:00
update_ring . send ( Arc ::new ( ring ) ) ? ;
drop ( update_ring ) ;
let self2 = self . clone ( ) ;
2022-12-14 15:08:05 +00:00
tokio ::spawn ( async move {
if let Err ( e ) = self2
2021-10-14 09:50:12 +00:00
. rpc
. broadcast (
& self2 . system_endpoint ,
2021-11-09 11:24:04 +00:00
SystemRpc ::AdvertiseClusterLayout ( layout ) ,
2021-10-19 14:16:10 +00:00
RequestStrategy ::with_priority ( PRIO_HIGH ) ,
2021-10-14 09:50:12 +00:00
)
2022-12-14 15:08:05 +00:00
. await
{
warn! ( " Error while broadcasting new cluster layout: {} " , e ) ;
}
2021-10-14 09:50:12 +00:00
} ) ;
2022-12-14 11:51:16 +00:00
self . save_cluster_layout ( ) . await ? ;
2021-10-14 09:50:12 +00:00
}
Ok ( SystemRpc ::Ok )
}
2021-10-15 09:05:09 +00:00
async fn status_exchange_loop ( & self , mut stop_signal : watch ::Receiver < bool > ) {
while ! * stop_signal . borrow ( ) {
2023-09-12 12:35:48 +00:00
let restart_at = Instant ::now ( ) + STATUS_EXCHANGE_INTERVAL ;
2021-10-15 09:05:09 +00:00
self . update_local_status ( ) ;
let local_status : NodeStatus = self . local_status . load ( ) . as_ref ( ) . clone ( ) ;
2022-07-22 13:20:00 +00:00
let _ = self
. rpc
2021-10-15 09:05:09 +00:00
. broadcast (
& self . system_endpoint ,
SystemRpc ::AdvertiseStatus ( local_status ) ,
2023-09-12 12:35:48 +00:00
RequestStrategy ::with_priority ( PRIO_HIGH )
. with_custom_timeout ( STATUS_EXCHANGE_INTERVAL ) ,
2021-10-15 09:05:09 +00:00
)
. await ;
select! {
2023-09-12 12:35:48 +00:00
_ = tokio ::time ::sleep_until ( restart_at . into ( ) ) = > { } ,
_ = stop_signal . changed ( ) = > { } ,
2021-10-15 09:05:09 +00:00
}
}
}
// Periodically try to discover and connect to other nodes.
//
// A bootstrap/discovery step is performed when the layout is not valid,
// when fewer peers than the replication factor are known, or when the
// number of peers that are up differs from the number of nodes in the
// layout. On every iteration the peer list is saved to disk and, when the
// corresponding features are enabled, this node is advertised to Consul
// and Kubernetes. The loop then waits for `DISCOVERY_INTERVAL` or for a
// change of `stop_signal`, and ends once `stop_signal` holds `true`.
async fn discovery_loop ( self : & Arc < Self > , mut stop_signal : watch ::Receiver < bool > ) {
2021-10-14 09:50:12 +00:00
while ! * stop_signal . borrow ( ) {
2021-11-09 11:24:04 +00:00
let not_configured = ! self . ring . borrow ( ) . layout . check ( ) ;
2021-10-14 09:50:12 +00:00
let no_peers = self . fullmesh . get_peer_list ( ) . len ( ) < self . replication_factor ;
2021-11-09 11:24:04 +00:00
let expected_n_nodes = self . ring . borrow ( ) . layout . num_nodes ( ) ;
2021-10-14 09:50:12 +00:00
let bad_peers = self
. fullmesh
. get_peer_list ( )
. iter ( )
. filter ( | p | p . is_up ( ) )
2021-11-09 11:24:04 +00:00
. count ( ) ! = expected_n_nodes ;
2021-10-14 09:50:12 +00:00
if not_configured | | no_peers | | bad_peers {
info! ( " Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {}) " , not_configured , no_peers , bad_peers ) ;
2022-09-14 14:09:38 +00:00
// Start from the bootstrap peers given in the configuration
let mut ping_list = resolve_peers ( & self . bootstrap_peers ) . await ;
2021-10-14 09:50:12 +00:00
2021-10-15 09:05:09 +00:00
// Add peer list from list stored on disk
if let Ok ( peers ) = self . persist_peer_list . load_async ( ) . await {
2023-01-03 13:44:47 +00:00
ping_list . extend ( peers . 0. iter ( ) . map ( | ( id , addr ) | ( ( * id ) . into ( ) , * addr ) ) )
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
// Fetch peer list from Consul
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2022-03-16 11:09:50 +00:00
if let Some ( c ) = & self . consul_discovery {
2022-10-18 17:11:16 +00:00
match c . get_consul_nodes ( ) . await {
2021-10-14 09:50:12 +00:00
Ok ( node_list ) = > {
2021-10-15 09:05:09 +00:00
ping_list . extend ( node_list ) ;
2021-10-14 09:50:12 +00:00
}
Err ( e ) = > {
warn! ( " Could not retrieve node list from Consul: {} " , e ) ;
}
}
}
2022-03-06 13:50:00 +00:00
// Fetch peer list from Kubernetes
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
if let Some ( k ) = & self . kubernetes_discovery {
// Create the custom resource definition first, unless the
// configuration asks to skip it; a failure is only logged
if ! k . skip_crd {
2022-03-06 13:50:00 +00:00
match create_kubernetes_crd ( ) . await {
Ok ( ( ) ) = > ( ) ,
Err ( e ) = > {
error! ( " Failed to create kubernetes custom resource: {} " , e )
}
} ;
}
2022-10-18 16:38:20 +00:00
match get_kubernetes_nodes ( k ) . await {
2022-03-06 13:50:00 +00:00
Ok ( node_list ) = > {
ping_list . extend ( node_list ) ;
}
Err ( e ) = > {
warn! ( " Could not retrieve node list from Kubernetes: {} " , e ) ;
}
}
}
2021-10-14 09:50:12 +00:00
// Try every candidate in its own background task, so that one
// connection attempt does not delay the others
for ( node_id , node_addr ) in ping_list {
2022-12-14 15:08:05 +00:00
let self2 = self . clone ( ) ;
tokio ::spawn ( async move {
if let Err ( e ) = self2 . netapp . clone ( ) . try_connect ( node_addr , node_id ) . await {
2022-12-14 15:11:19 +00:00
error! ( " {} " , connect_error_message ( node_addr , node_id , e ) ) ;
2022-12-14 15:08:05 +00:00
}
} ) ;
2021-10-14 09:50:12 +00:00
}
}
2021-11-03 16:34:44 +00:00
// Done on every iteration, whether or not a discovery step ran
if let Err ( e ) = self . save_peer_list ( ) . await {
2021-10-15 09:05:09 +00:00
warn! ( " Could not save peer list to file: {} " , e ) ;
}
2022-10-18 16:38:20 +00:00
// Advertisements run in background tasks and log their own errors
#[ cfg(feature = " consul-discovery " ) ]
2023-01-03 15:55:59 +00:00
tokio ::spawn ( self . clone ( ) . advertise_to_consul ( ) ) ;
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2023-01-03 15:55:59 +00:00
tokio ::spawn ( self . clone ( ) . advertise_to_kubernetes ( ) ) ;
2021-10-15 09:05:09 +00:00
2021-10-14 09:50:12 +00:00
select! {
2023-09-12 12:35:48 +00:00
_ = tokio ::time ::sleep ( DISCOVERY_INTERVAL ) = > { } ,
_ = stop_signal . changed ( ) = > { } ,
2021-10-14 09:50:12 +00:00
}
}
}
2021-11-03 16:34:44 +00:00
async fn save_peer_list ( & self ) -> Result < ( ) , Error > {
// Prepare new peer list to save to file
// It is a vec of tuples (node ID as Uuid, node SocketAddr)
let mut peer_list = self
. fullmesh
. get_peer_list ( )
. iter ( )
. map ( | n | ( n . id . into ( ) , n . addr ) )
. collect ::< Vec < _ > > ( ) ;
// Before doing it, we read the current peer list file (if it exists)
// and append it to the list we are about to save,
// so that no peer ID gets lost in the process.
if let Ok ( mut prev_peer_list ) = self . persist_peer_list . load_async ( ) . await {
2023-01-03 13:44:47 +00:00
prev_peer_list
. 0
. retain ( | ( id , _ip ) | peer_list . iter ( ) . all ( | ( id2 , _ip2 ) | id2 ! = id ) ) ;
peer_list . extend ( prev_peer_list . 0 ) ;
2021-11-03 16:34:44 +00:00
}
// Save new peer list to file
2023-01-03 13:44:47 +00:00
self . persist_peer_list
. save_async ( & PeerList ( peer_list ) )
. await
2021-11-03 16:34:44 +00:00
}
2021-11-09 11:24:04 +00:00
async fn pull_cluster_layout ( self : Arc < Self > , peer : Uuid ) {
2021-10-14 09:50:12 +00:00
let resp = self
. rpc
. call (
& self . system_endpoint ,
peer ,
2021-11-09 11:24:04 +00:00
SystemRpc ::PullClusterLayout ,
2022-09-19 18:12:19 +00:00
RequestStrategy ::with_priority ( PRIO_HIGH ) ,
2021-10-14 09:50:12 +00:00
)
. await ;
2021-11-09 11:24:04 +00:00
if let Ok ( SystemRpc ::AdvertiseClusterLayout ( layout ) ) = resp {
let _ : Result < _ , _ > = self . handle_advertise_cluster_layout ( & layout ) . await ;
2021-10-14 09:50:12 +00:00
}
}
}
#[ async_trait ]
impl EndpointHandler < SystemRpc > for System {
2021-10-15 09:05:09 +00:00
async fn handle ( self : & Arc < Self > , msg : & SystemRpc , from : NodeID ) -> Result < SystemRpc , Error > {
match msg {
SystemRpc ::Connect ( node ) = > self . handle_connect ( node ) . await ,
2021-11-09 11:24:04 +00:00
SystemRpc ::PullClusterLayout = > Ok ( self . handle_pull_cluster_layout ( ) ) ,
2021-10-15 09:05:09 +00:00
SystemRpc ::AdvertiseStatus ( adv ) = > self . handle_advertise_status ( from . into ( ) , adv ) . await ,
2021-11-09 11:24:04 +00:00
SystemRpc ::AdvertiseClusterLayout ( adv ) = > {
self . clone ( ) . handle_advertise_cluster_layout ( adv ) . await
}
2021-10-15 09:05:09 +00:00
SystemRpc ::GetKnownNodes = > Ok ( self . handle_get_known_nodes ( ) ) ,
2022-01-03 12:58:05 +00:00
m = > Err ( Error ::unexpected_rpc_message ( m ) ) ,
2021-10-14 09:50:12 +00:00
}
}
}
2022-03-16 11:09:50 +00:00
2023-01-26 14:04:32 +00:00
impl NodeStatus {
fn initial ( replication_factor : usize , layout : & ClusterLayout ) -> Self {
NodeStatus {
hostname : gethostname ::gethostname ( )
. into_string ( )
. unwrap_or_else ( | _ | " <invalid utf-8> " . to_string ( ) ) ,
replication_factor ,
cluster_layout_version : layout . version ,
cluster_layout_staging_hash : layout . staging_hash ,
meta_disk_avail : None ,
data_disk_avail : None ,
}
}
fn unknown ( ) -> Self {
NodeStatus {
hostname : " ? " . to_string ( ) ,
replication_factor : 0 ,
cluster_layout_version : 0 ,
cluster_layout_staging_hash : Hash ::from ( [ 0 u8 ; 32 ] ) ,
meta_disk_avail : None ,
data_disk_avail : None ,
}
}
2023-01-26 14:30:36 +00:00
fn update_disk_usage ( & mut self , meta_dir : & Path , data_dir : & Path , metrics : & SystemMetrics ) {
2023-09-11 17:08:24 +00:00
use nix ::sys ::statvfs ::statvfs ;
let mount_avail = | path : & Path | match statvfs ( path ) {
Ok ( x ) = > {
2023-09-11 18:01:29 +00:00
let avail = x . blocks_available ( ) * x . fragment_size ( ) as u64 ;
let total = x . blocks ( ) * x . fragment_size ( ) as u64 ;
2023-09-11 17:08:24 +00:00
Some ( ( avail , total ) )
}
Err ( _ ) = > None ,
2023-01-26 14:04:32 +00:00
} ;
self . meta_disk_avail = mount_avail ( meta_dir ) ;
self . data_disk_avail = mount_avail ( data_dir ) ;
2023-01-26 14:30:36 +00:00
if let Some ( ( avail , total ) ) = self . meta_disk_avail {
metrics
. values
. meta_disk_avail
. store ( avail , Ordering ::Relaxed ) ;
metrics
. values
. meta_disk_total
. store ( total , Ordering ::Relaxed ) ;
}
if let Some ( ( avail , total ) ) = self . data_disk_avail {
metrics
. values
. data_disk_avail
. store ( avail , Ordering ::Relaxed ) ;
metrics
. values
. data_disk_total
. store ( total , Ordering ::Relaxed ) ;
}
2023-01-26 14:04:32 +00:00
}
}
2022-03-16 11:09:50 +00:00
/// Returns the first IP address of the first network interface that is up,
/// is not a loopback interface and has at least one address.
///
/// Returns `None` when no interface satisfies these conditions.
fn get_default_ip() -> Option<IpAddr> {
    let interfaces = pnet_datalink::interfaces();
    for iface in interfaces.iter() {
        if !iface.is_up() || iface.is_loopback() {
            continue;
        }
        // An interface without any address is skipped, not treated as a match.
        if let Some(network) = iface.ips.first() {
            return Some(network.ip());
        }
    }
    None
}
2022-09-14 14:09:38 +00:00
async fn resolve_peers ( peers : & [ String ] ) -> Vec < ( NodeID , SocketAddr ) > {
let mut ret = vec! [ ] ;
for peer in peers . iter ( ) {
match parse_and_resolve_peer_addr_async ( peer ) . await {
Some ( ( pubkey , addrs ) ) = > {
for ip in addrs {
ret . push ( ( pubkey , ip ) ) ;
}
}
None = > {
warn! ( " Unable to parse and/or resolve peer hostname {} " , peer ) ;
}
}
}
ret
}
2022-12-14 11:57:33 +00:00
2022-12-14 15:11:19 +00:00
fn connect_error_message (
addr : SocketAddr ,
pubkey : ed25519 ::PublicKey ,
e : netapp ::error ::Error ,
) -> String {
format! ( " Error establishing RPC connection to remote node: {} @ {} . \n This can happen if the remote node is not reachable on the network, but also if the two nodes are not configured with the same rpc_secret. \n {} " , hex ::encode ( pubkey ) , addr , e )
2022-12-14 11:57:33 +00:00
}