2021-10-14 11:50:12 +02:00
//! Module containing structs related to membership management
2023-11-27 12:10:21 +01:00
use std ::collections ::{ HashMap , HashSet } ;
2021-10-14 11:50:12 +02:00
use std ::io ::{ Read , Write } ;
2022-03-06 14:50:00 +01:00
use std ::net ::{ IpAddr , SocketAddr } ;
2022-07-08 13:30:26 +02:00
use std ::path ::{ Path , PathBuf } ;
2023-01-26 15:30:36 +01:00
use std ::sync ::atomic ::Ordering ;
2023-11-09 14:12:05 +01:00
use std ::sync ::{ Arc , RwLock , RwLockReadGuard } ;
2021-10-19 16:16:10 +02:00
use std ::time ::{ Duration , Instant } ;
2021-10-14 11:50:12 +02:00
use arc_swap ::ArcSwap ;
use async_trait ::async_trait ;
2023-09-12 14:35:48 +02:00
use futures ::join ;
2021-10-14 11:50:12 +02:00
use serde ::{ Deserialize , Serialize } ;
use sodiumoxide ::crypto ::sign ::ed25519 ;
2023-09-12 14:35:48 +02:00
use tokio ::select ;
2023-11-09 14:12:05 +01:00
use tokio ::sync ::{ watch , Notify } ;
2021-10-14 11:50:12 +02:00
2021-10-15 11:05:09 +02:00
use netapp ::endpoint ::{ Endpoint , EndpointHandler } ;
2022-07-22 15:20:00 +02:00
use netapp ::message ::* ;
2021-10-14 11:50:12 +02:00
use netapp ::peering ::fullmesh ::FullMeshPeeringStrategy ;
2022-09-14 16:09:38 +02:00
use netapp ::util ::parse_and_resolve_peer_addr_async ;
2021-10-19 16:16:10 +02:00
use netapp ::{ NetApp , NetworkKey , NodeID , NodeKey } ;
2021-10-14 11:50:12 +02:00
2022-10-18 18:38:20 +02:00
#[ cfg(feature = " kubernetes-discovery " ) ]
use garage_util ::config ::KubernetesDiscoveryConfig ;
2023-09-04 14:49:49 +02:00
use garage_util ::config ::{ Config , DataDirEnum } ;
2021-11-09 12:24:04 +01:00
use garage_util ::data ::* ;
2021-10-19 16:16:10 +02:00
use garage_util ::error ::* ;
2021-10-14 11:50:12 +02:00
use garage_util ::persister ::Persister ;
2021-10-15 11:05:09 +02:00
use garage_util ::time ::* ;
2021-10-14 11:50:12 +02:00
2022-10-18 18:38:20 +02:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 19:11:16 +02:00
use crate ::consul ::ConsulDiscovery ;
2022-03-16 12:09:50 +01:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-03-06 14:50:00 +01:00
use crate ::kubernetes ::* ;
2023-11-16 13:51:40 +01:00
use crate ::layout ::{
self , manager ::LayoutManager , LayoutDigest , LayoutHelper , LayoutHistory , NodeRoleV ,
} ;
2022-12-05 15:28:57 +01:00
use crate ::replication_mode ::* ;
2021-10-15 11:05:09 +02:00
use crate ::rpc_helper ::* ;
2021-10-14 11:50:12 +02:00
2023-01-09 17:15:55 +00:00
use crate ::system_metrics ::* ;
2021-10-14 11:50:12 +02:00
/// Pause between two iterations of the bootstrap/discovery loop.
const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);

/// Period of the status exchange loop. The same value is used as the
/// timeout of each status broadcast.
const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);

/// Version tag used for version check upon Netapp connection.
/// Cluster nodes with different version tags are deemed
/// incompatible and will refuse to connect.
// The six high bytes spell "garage" in ASCII; the two low bytes are 0x000A.
pub const GARAGE_VERSION_TAG: u64 = 0x6761_7261_6765_000A;

/// RPC endpoint used for calls related to membership
pub const SYSTEM_RPC_PATH: &str = " garage_rpc/system.rs/SystemRpc ";
2021-10-14 11:50:12 +02:00
/// RPC messages related to membership
#[ derive(Debug, Serialize, Deserialize, Clone) ]
pub enum SystemRpc {
/// Response to successfull advertisements
Ok ,
2021-10-15 11:05:09 +02:00
/// Request to connect to a specific node (in <pubkey>@<host>:<port> format)
Connect ( String ) ,
2021-10-14 11:50:12 +02:00
/// Advertise Garage status. Answered with another AdvertiseStatus.
/// Exchanged with every node on a regular basis.
2021-10-15 11:05:09 +02:00
AdvertiseStatus ( NodeStatus ) ,
2021-10-14 11:50:12 +02:00
/// Get known nodes states
GetKnownNodes ,
/// Return known nodes
2021-10-15 11:05:09 +02:00
ReturnKnownNodes ( Vec < KnownNodeInfo > ) ,
2023-11-09 14:53:34 +01:00
2023-11-09 12:55:36 +01:00
/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
PullClusterLayout ,
/// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout
2023-11-09 14:12:05 +01:00
AdvertiseClusterLayout ( LayoutHistory ) ,
2023-11-09 14:53:34 +01:00
/// Ask other node its cluster layout update trackers.
PullClusterLayoutTrackers ,
/// Advertisement of cluster layout update trackers.
AdvertiseClusterLayoutTrackers ( layout ::UpdateTrackers ) ,
2021-10-14 11:50:12 +02:00
}
2021-10-15 11:05:09 +02:00
impl Rpc for SystemRpc {
type Response = Result < SystemRpc , Error > ;
2021-10-14 11:50:12 +02:00
}
2023-01-03 14:44:47 +01:00
/// Serializable list of peers, as `(node id, socket address)` pairs.
/// This is the type stored through the peer list persister.
#[derive(Serialize, Deserialize)]
pub struct PeerList(Vec<(Uuid, SocketAddr)>);

impl garage_util::migrate::InitialFormat for PeerList {}
2021-10-14 11:50:12 +02:00
/// This node's membership manager
pub struct System {
/// The id of this node
2021-10-15 11:05:09 +02:00
pub id : Uuid ,
2021-10-14 11:50:12 +02:00
2023-01-03 14:44:47 +01:00
persist_peer_list : Persister < PeerList > ,
2021-10-14 11:50:12 +02:00
2021-10-15 11:05:09 +02:00
local_status : ArcSwap < NodeStatus > ,
node_status : RwLock < HashMap < Uuid , ( u64 , NodeStatus ) > > ,
2021-10-14 11:50:12 +02:00
pub netapp : Arc < NetApp > ,
fullmesh : Arc < FullMeshPeeringStrategy > ,
2023-11-09 12:55:36 +01:00
pub ( crate ) system_endpoint : Arc < Endpoint < SystemRpc , System > > ,
2021-10-14 11:50:12 +02:00
rpc_listen_addr : SocketAddr ,
2022-10-18 18:38:20 +02:00
#[ cfg(any(feature = " consul-discovery " , feature = " kubernetes-discovery " )) ]
2021-10-15 11:05:09 +02:00
rpc_public_addr : Option < SocketAddr > ,
2022-09-14 16:09:38 +02:00
bootstrap_peers : Vec < String > ,
2022-03-06 14:50:00 +01:00
2022-10-18 18:38:20 +02:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 19:11:16 +02:00
consul_discovery : Option < ConsulDiscovery > ,
2022-03-16 12:09:50 +01:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-10-18 18:38:20 +02:00
kubernetes_discovery : Option < KubernetesDiscoveryConfig > ,
2022-03-06 14:50:00 +01:00
2023-11-09 13:34:14 +01:00
pub layout_manager : Arc < LayoutManager > ,
2023-11-09 12:55:36 +01:00
2023-01-09 17:15:55 +00:00
metrics : SystemMetrics ,
2022-03-06 14:50:00 +01:00
2022-12-05 15:28:57 +01:00
replication_mode : ReplicationMode ,
2021-10-14 11:50:12 +02:00
replication_factor : usize ,
2022-07-08 13:30:26 +02:00
/// Path to metadata directory
pub metadata_dir : PathBuf ,
2023-01-26 15:04:32 +01:00
/// Path to data directory
2023-09-04 14:49:49 +02:00
pub data_dir : DataDirEnum ,
2021-10-14 11:50:12 +02:00
}
#[ derive(Debug, Clone, Serialize, Deserialize) ]
2021-10-15 11:05:09 +02:00
pub struct NodeStatus {
2021-10-14 11:50:12 +02:00
/// Hostname of the node
2023-11-28 14:25:04 +01:00
pub hostname : Option < String > ,
2023-01-26 15:04:32 +01:00
2021-10-14 11:50:12 +02:00
/// Replication factor configured on the node
pub replication_factor : usize ,
2023-11-09 12:55:36 +01:00
2023-11-16 13:51:40 +01:00
/// Cluster layout digest
pub layout_digest : LayoutDigest ,
2023-01-26 15:04:32 +01:00
/// Disk usage on partition containing metadata directory (tuple: `(avail, total)`)
#[ serde(default) ]
pub meta_disk_avail : Option < ( u64 , u64 ) > ,
/// Disk usage on partition containing data directory (tuple: `(avail, total)`)
#[ serde(default) ]
pub data_disk_avail : Option < ( u64 , u64 ) > ,
2021-10-14 11:50:12 +02:00
}
2021-10-15 11:05:09 +02:00
#[ derive(Debug, Clone, Serialize, Deserialize) ]
pub struct KnownNodeInfo {
pub id : Uuid ,
pub addr : SocketAddr ,
pub is_up : bool ,
2021-10-19 16:16:10 +02:00
pub last_seen_secs_ago : Option < u64 > ,
2021-10-15 11:05:09 +02:00
pub status : NodeStatus ,
}
2023-06-14 13:53:19 +02:00
/// Summary of the cluster health, as computed by `System::health`.
#[derive(Debug, Clone, Copy)]
pub struct ClusterHealth {
    /// The current health status of the cluster (see below)
    pub status: ClusterHealthStatus,
    /// Number of nodes already seen once in the cluster
    pub known_nodes: usize,
    /// Number of nodes currently connected
    pub connected_nodes: usize,
    /// Number of storage nodes declared in the current layout
    pub storage_nodes: usize,
    /// Number of storage nodes currently connected
    pub storage_nodes_ok: usize,
    /// Number of partitions in the layout
    pub partitions: usize,
    /// Number of partitions for which we have a quorum of connected nodes
    pub partitions_quorum: usize,
    /// Number of partitions for which all storage nodes are connected
    pub partitions_all_ok: usize,
}

/// Overall health level of the cluster.
#[derive(Debug, Clone, Copy)]
pub enum ClusterHealthStatus {
    /// All nodes are available
    Healthy,
    /// Some storage nodes are unavailable, but quorum is still
    /// achieved for all partitions
    Degraded,
    /// Quorum is not available for some partitions
    Unavailable,
}
2021-10-19 16:16:10 +02:00
pub fn read_node_id ( metadata_dir : & Path ) -> Result < NodeID , Error > {
let mut pubkey_file = metadata_dir . to_path_buf ( ) ;
pubkey_file . push ( " node_key.pub " ) ;
let mut f = std ::fs ::File ::open ( pubkey_file . as_path ( ) ) ? ;
let mut d = vec! [ ] ;
f . read_to_end ( & mut d ) ? ;
if d . len ( ) ! = 32 {
return Err ( Error ::Message ( " Corrupt node_key.pub file " . to_string ( ) ) ) ;
}
let mut key = [ 0 u8 ; 32 ] ;
key . copy_from_slice ( & d [ .. ] ) ;
Ok ( NodeID ::from_slice ( & key [ .. ] ) . unwrap ( ) )
}
pub fn gen_node_key ( metadata_dir : & Path ) -> Result < NodeKey , Error > {
2021-10-15 11:05:09 +02:00
let mut key_file = metadata_dir . to_path_buf ( ) ;
key_file . push ( " node_key " ) ;
if key_file . as_path ( ) . exists ( ) {
let mut f = std ::fs ::File ::open ( key_file . as_path ( ) ) ? ;
2021-10-14 11:50:12 +02:00
let mut d = vec! [ ] ;
f . read_to_end ( & mut d ) ? ;
if d . len ( ) ! = 64 {
2021-10-15 11:05:09 +02:00
return Err ( Error ::Message ( " Corrupt node_key file " . to_string ( ) ) ) ;
2021-10-14 11:50:12 +02:00
}
let mut key = [ 0 u8 ; 64 ] ;
key . copy_from_slice ( & d [ .. ] ) ;
Ok ( NodeKey ::from_slice ( & key [ .. ] ) . unwrap ( ) )
} else {
2021-10-19 16:16:10 +02:00
if ! metadata_dir . exists ( ) {
info! ( " Metadata directory does not exist, creating it. " ) ;
2023-01-26 17:26:32 +01:00
std ::fs ::create_dir ( metadata_dir ) ? ;
2021-10-19 16:16:10 +02:00
}
info! ( " Generating new node key pair. " ) ;
let ( pubkey , key ) = ed25519 ::gen_keypair ( ) ;
{
use std ::os ::unix ::fs ::PermissionsExt ;
let mut f = std ::fs ::File ::create ( key_file . as_path ( ) ) ? ;
let mut perm = f . metadata ( ) ? . permissions ( ) ;
perm . set_mode ( 0o600 ) ;
std ::fs ::set_permissions ( key_file . as_path ( ) , perm ) ? ;
f . write_all ( & key [ .. ] ) ? ;
}
{
let mut pubkey_file = metadata_dir . to_path_buf ( ) ;
pubkey_file . push ( " node_key.pub " ) ;
let mut f2 = std ::fs ::File ::create ( pubkey_file . as_path ( ) ) ? ;
f2 . write_all ( & pubkey [ .. ] ) ? ;
}
2021-10-14 11:50:12 +02:00
2021-10-15 11:05:09 +02:00
Ok ( key )
2021-10-14 11:50:12 +02:00
}
}
impl System {
/// Create this node's membership manager
pub fn new (
network_key : NetworkKey ,
2022-12-05 15:28:57 +01:00
replication_mode : ReplicationMode ,
2021-10-19 16:16:10 +02:00
config : & Config ,
2022-09-13 16:22:23 +02:00
) -> Result < Arc < Self > , Error > {
2023-11-09 12:55:36 +01:00
// ---- setup netapp RPC protocol ----
2021-10-19 16:16:10 +02:00
let node_key =
gen_node_key ( & config . metadata_dir ) . expect ( " Unable to read or generate node ID " ) ;
2022-02-18 20:39:55 +01:00
info! (
" Node ID of this node: {} " ,
hex ::encode ( & node_key . public_key ( ) [ .. 8 ] )
) ;
2021-10-14 11:50:12 +02:00
2023-11-09 12:55:36 +01:00
let netapp = NetApp ::new ( GARAGE_VERSION_TAG , network_key , node_key ) ;
let system_endpoint = netapp . endpoint ( SYSTEM_RPC_PATH . into ( ) ) ;
2021-10-14 11:50:12 +02:00
2023-11-09 12:55:36 +01:00
// ---- setup netapp public listener and full mesh peering strategy ----
let rpc_public_addr = get_rpc_public_addr ( config ) ;
2022-09-14 16:09:38 +02:00
if rpc_public_addr . is_none ( ) {
warn! ( " This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication. " ) ;
}
2022-03-16 12:09:50 +01:00
2022-09-14 16:09:38 +02:00
let fullmesh = FullMeshPeeringStrategy ::new ( netapp . clone ( ) , vec! [ ] , rpc_public_addr ) ;
2022-09-19 20:12:19 +02:00
if let Some ( ping_timeout ) = config . rpc_ping_timeout_msec {
fullmesh . set_ping_timeout_millis ( ping_timeout ) ;
}
2021-10-14 11:50:12 +02:00
2023-11-09 12:55:36 +01:00
let persist_peer_list = Persister ::new ( & config . metadata_dir , " peer_list " ) ;
2021-10-14 11:50:12 +02:00
2023-11-09 12:55:36 +01:00
// ---- setup cluster layout and layout manager ----
let replication_factor = replication_mode . replication_factor ( ) ;
let layout_manager = LayoutManager ::new (
config ,
netapp . id ,
system_endpoint . clone ( ) ,
fullmesh . clone ( ) ,
2023-12-07 14:27:53 +01:00
replication_mode ,
2023-11-09 12:55:36 +01:00
) ? ;
// ---- set up metrics and status exchange ----
let metrics = SystemMetrics ::new ( replication_factor ) ;
2023-11-09 13:34:14 +01:00
let mut local_status = NodeStatus ::initial ( replication_factor , & layout_manager ) ;
2023-11-09 12:55:36 +01:00
local_status . update_disk_usage ( & config . metadata_dir , & config . data_dir , & metrics ) ;
// ---- if enabled, set up additionnal peer discovery methods ----
2022-10-18 19:13:52 +02:00
#[ cfg(feature = " consul-discovery " ) ]
let consul_discovery = match & config . consul_discovery {
Some ( cfg ) = > Some (
ConsulDiscovery ::new ( cfg . clone ( ) )
. ok_or_message ( " Invalid Consul discovery configuration " ) ? ,
) ,
None = > None ,
} ;
2022-10-18 18:38:20 +02:00
#[ cfg(not(feature = " consul-discovery " )) ]
if config . consul_discovery . is_some ( ) {
warn! ( " Consul discovery is not enabled in this build. " ) ;
}
2022-03-16 12:09:50 +01:00
#[ cfg(not(feature = " kubernetes-discovery " )) ]
2022-10-18 18:38:20 +02:00
if config . kubernetes_discovery . is_some ( ) {
2022-03-16 12:09:50 +01:00
warn! ( " Kubernetes discovery is not enabled in this build. " ) ;
}
2023-11-09 12:55:36 +01:00
// ---- done ----
2021-10-14 11:50:12 +02:00
let sys = Arc ::new ( System {
2021-10-15 11:05:09 +02:00
id : netapp . id . into ( ) ,
persist_peer_list ,
local_status : ArcSwap ::new ( Arc ::new ( local_status ) ) ,
node_status : RwLock ::new ( HashMap ::new ( ) ) ,
2021-10-14 11:50:12 +02:00
netapp : netapp . clone ( ) ,
2023-12-11 15:31:47 +01:00
fullmesh ,
2021-10-14 11:50:12 +02:00
system_endpoint ,
2022-12-05 15:28:57 +01:00
replication_mode ,
2021-10-14 11:50:12 +02:00
replication_factor ,
2021-10-19 16:16:10 +02:00
rpc_listen_addr : config . rpc_bind_addr ,
2022-10-18 18:38:20 +02:00
#[ cfg(any(feature = " consul-discovery " , feature = " kubernetes-discovery " )) ]
2022-03-16 12:09:50 +01:00
rpc_public_addr ,
2021-10-19 16:16:10 +02:00
bootstrap_peers : config . bootstrap_peers . clone ( ) ,
2022-10-18 18:38:20 +02:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 19:11:16 +02:00
consul_discovery ,
2022-03-16 12:09:50 +01:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-10-18 18:38:20 +02:00
kubernetes_discovery : config . kubernetes_discovery . clone ( ) ,
2023-11-09 12:55:36 +01:00
layout_manager ,
2023-01-09 17:15:55 +00:00
metrics ,
2022-03-06 14:50:00 +01:00
2022-07-08 13:30:26 +02:00
metadata_dir : config . metadata_dir . clone ( ) ,
2023-01-26 15:04:32 +01:00
data_dir : config . data_dir . clone ( ) ,
2021-10-14 11:50:12 +02:00
} ) ;
sys . system_endpoint . set_handler ( sys . clone ( ) ) ;
2022-09-13 16:22:23 +02:00
Ok ( sys )
2021-10-14 11:50:12 +02:00
}
/// Perform bootstrapping, starting the ping loop
///
/// Runs the netapp listener, the full mesh peering strategy, the discovery
/// loop and the status exchange loop concurrently, and returns once all
/// four have completed. Each of them receives a clone of `must_exit`.
pub async fn run(self: Arc<Self>, must_exit: watch::Receiver<bool>) {
    let listener = self
        .netapp
        .clone()
        .listen(self.rpc_listen_addr, None, must_exit.clone());
    let peering = self.fullmesh.clone().run(must_exit.clone());
    let discovery = self.discovery_loop(must_exit.clone());
    let status_exchange = self.status_exchange_loop(must_exit.clone());

    join!(listener, peering, discovery, status_exchange);
}
2023-11-09 12:55:36 +01:00
// ---- Public utilities / accessors ----
2023-11-15 14:20:50 +01:00
/// Acquire a read guard on the cluster layout held by the layout manager.
pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHelper> {
    let manager = &self.layout_manager;
    manager.layout()
}
2023-11-09 14:12:05 +01:00
/// Return a handle to the layout manager's `change_notify` notifier.
pub fn layout_notify(&self) -> Arc<Notify> {
    let notify = &self.layout_manager.change_notify;
    notify.clone()
}
/// Borrow the RPC helper owned by the layout manager.
pub fn rpc_helper(&self) -> &RpcHelper {
    let manager = &self.layout_manager;
    &manager.rpc_helper
}
2022-05-24 12:16:39 +02:00
// ---- Administrative operations (directly available and
// also available through RPC) ----
pub fn get_known_nodes ( & self ) -> Vec < KnownNodeInfo > {
let node_status = self . node_status . read ( ) . unwrap ( ) ;
let known_nodes = self
. fullmesh
. get_peer_list ( )
. iter ( )
. map ( | n | KnownNodeInfo {
id : n . id . into ( ) ,
addr : n . addr ,
is_up : n . is_up ( ) ,
2022-09-29 15:53:54 +02:00
last_seen_secs_ago : n
. last_seen
. map ( | t | ( Instant ::now ( ) . saturating_duration_since ( t ) ) . as_secs ( ) ) ,
2022-05-24 12:16:39 +02:00
status : node_status
. get ( & n . id . into ( ) )
. cloned ( )
. map ( | ( _ , st ) | st )
2023-01-26 17:26:32 +01:00
. unwrap_or_else ( NodeStatus ::unknown ) ,
2022-05-24 12:16:39 +02:00
} )
. collect ::< Vec < _ > > ( ) ;
known_nodes
}
pub async fn connect ( & self , node : & str ) -> Result < ( ) , Error > {
2022-09-14 16:09:38 +02:00
let ( pubkey , addrs ) = parse_and_resolve_peer_addr_async ( node )
. await
. ok_or_else ( | | {
Error ::Message ( format! (
" Unable to parse or resolve node specification: {} " ,
node
) )
} ) ? ;
2022-05-24 12:16:39 +02:00
let mut errors = vec! [ ] ;
2022-12-14 12:57:33 +01:00
for addr in addrs . iter ( ) {
2022-12-14 16:11:19 +01:00
match self . netapp . clone ( ) . try_connect ( * addr , pubkey ) . await {
2022-05-24 12:16:39 +02:00
Ok ( ( ) ) = > return Ok ( ( ) ) ,
Err ( e ) = > {
2022-12-14 16:11:19 +01:00
errors . push ( (
* addr ,
Error ::Message ( connect_error_message ( * addr , pubkey , e ) ) ,
) ) ;
2022-05-24 12:16:39 +02:00
}
}
}
if errors . len ( ) = = 1 {
Err ( Error ::Message ( errors [ 0 ] . 1. to_string ( ) ) )
} else {
Err ( Error ::Message ( format! ( " {:?} " , errors ) ) )
}
}
2022-12-05 15:28:57 +01:00
pub fn health ( & self ) -> ClusterHealth {
let quorum = self . replication_mode . write_quorum ( ) ;
2023-11-27 12:10:21 +01:00
// Gather information about running nodes.
// Technically, `nodes` contains currently running nodes, as well
// as nodes that this Garage process has been connected to at least
// once since it started.
2022-12-05 15:28:57 +01:00
let nodes = self
. get_known_nodes ( )
. into_iter ( )
. map ( | n | ( n . id , n ) )
. collect ::< HashMap < Uuid , _ > > ( ) ;
let connected_nodes = nodes . iter ( ) . filter ( | ( _ , n ) | n . is_up ) . count ( ) ;
2023-11-27 12:10:21 +01:00
let node_up = | x : & Uuid | nodes . get ( x ) . map ( | n | n . is_up ) . unwrap_or ( false ) ;
// Acquire a rwlock read-lock to the current cluster layout
let layout = self . cluster_layout ( ) ;
// Obtain information about nodes that have a role as storage nodes
// in one of the active layout versions
let mut storage_nodes = HashSet ::< Uuid > ::with_capacity ( 16 ) ;
for ver in layout . versions . iter ( ) {
storage_nodes . extend (
ver . roles
. items ( )
. iter ( )
. filter ( | ( _ , _ , v ) | matches! ( v , NodeRoleV ( Some ( r ) ) if r . capacity . is_some ( ) ) )
. map ( | ( n , _ , _ ) | * n ) ,
)
}
let storage_nodes_ok = storage_nodes . iter ( ) . filter ( | x | node_up ( x ) ) . count ( ) ;
2022-12-05 15:28:57 +01:00
2023-11-27 12:10:21 +01:00
// Determine the number of partitions that have:
// - a quorum of up nodes for all write sets (i.e. are available)
// - for which all nodes in all write sets are up (i.e. are fully healthy)
2023-11-11 12:08:32 +01:00
let partitions = layout . current ( ) . partitions ( ) . collect ::< Vec < _ > > ( ) ;
2023-11-27 12:10:21 +01:00
let mut partitions_quorum = 0 ;
let mut partitions_all_ok = 0 ;
for ( _ , hash ) in partitions . iter ( ) {
2023-12-08 10:36:37 +01:00
let mut write_sets = layout
2023-11-27 12:10:21 +01:00
. versions
. iter ( )
. map ( | x | x . nodes_of ( hash , x . replication_factor ) ) ;
let has_quorum = write_sets
. clone ( )
. all ( | set | set . filter ( | x | node_up ( x ) ) . count ( ) > = quorum ) ;
2023-12-08 10:36:37 +01:00
let all_ok = write_sets . all ( | mut set | set . all ( | x | node_up ( & x ) ) ) ;
2023-11-27 12:10:21 +01:00
if has_quorum {
partitions_quorum + = 1 ;
}
if all_ok {
partitions_all_ok + = 1 ;
}
}
2022-12-05 15:28:57 +01:00
2023-11-27 12:10:21 +01:00
// Determine overall cluster status
2022-12-05 15:28:57 +01:00
let status =
2023-12-08 10:36:37 +01:00
if partitions_all_ok = = partitions . len ( ) & & storage_nodes_ok = = storage_nodes . len ( ) {
2022-12-05 15:28:57 +01:00
ClusterHealthStatus ::Healthy
} else if partitions_quorum = = partitions . len ( ) {
ClusterHealthStatus ::Degraded
} else {
ClusterHealthStatus ::Unavailable
} ;
ClusterHealth {
status ,
known_nodes : nodes . len ( ) ,
connected_nodes ,
storage_nodes : storage_nodes . len ( ) ,
storage_nodes_ok ,
partitions : partitions . len ( ) ,
partitions_quorum ,
partitions_all_ok ,
}
}
2021-10-14 11:50:12 +02:00
// ---- INTERNALS ----
2022-10-18 18:38:20 +02:00
/// Publish this node to Consul, if Consul discovery is configured.
///
/// Does nothing when no Consul discovery is set up. Logs a warning and
/// returns when the public RPC address is unknown. A publication failure
/// is logged and otherwise ignored.
#[cfg(feature = " consul-discovery ")]
async fn advertise_to_consul(self: Arc<Self>) {
    let c = if let Some(c) = &self.consul_discovery {
        c
    } else {
        return;
    };

    let rpc_public_addr = if let Some(addr) = self.rpc_public_addr {
        addr
    } else {
        warn!(" Not advertising to Consul because rpc_public_addr is not defined in config file and could not be autodetected. ");
        return;
    };

    let status = self.local_status.load_full();
    let published = c
        .publish_consul_service(
            self.netapp.id,
            // Panics if the local status carries no hostname.
            &status.hostname.as_deref().unwrap(),
            rpc_public_addr,
        )
        .await;
    if let Err(e) = published {
        error!(" Error while publishing Consul service: {} ", e);
    }
}
2022-03-16 12:09:50 +01:00
/// Publish this node to Kubernetes, if Kubernetes discovery is configured.
///
/// Does nothing when no Kubernetes discovery is set up. Logs a warning and
/// returns when the public RPC address is unknown. A publication failure
/// is logged and otherwise ignored.
#[cfg(feature = " kubernetes-discovery ")]
async fn advertise_to_kubernetes(self: Arc<Self>) {
    let k = if let Some(k) = &self.kubernetes_discovery {
        k
    } else {
        return;
    };

    let rpc_public_addr = if let Some(addr) = self.rpc_public_addr {
        addr
    } else {
        warn!(" Not advertising to Kubernetes because rpc_public_addr is not defined in config file and could not be autodetected. ");
        return;
    };

    let status = self.local_status.load_full();
    let published = publish_kubernetes_node(
        k,
        self.netapp.id,
        // Panics if the local status carries no hostname.
        &status.hostname.as_deref().unwrap(),
        rpc_public_addr,
    )
    .await;
    if let Err(e) = published {
        error!(" Error while publishing node to Kubernetes: {} ", e);
    }
}
2021-10-15 11:05:09 +02:00
fn update_local_status ( & self ) {
let mut new_si : NodeStatus = self . local_status . load ( ) . as_ref ( ) . clone ( ) ;
2021-10-14 11:50:12 +02:00
2023-11-16 13:51:40 +01:00
new_si . layout_digest = self . layout_manager . layout ( ) . digest ( ) ;
2023-01-26 15:04:32 +01:00
2023-01-26 15:30:36 +01:00
new_si . update_disk_usage ( & self . metadata_dir , & self . data_dir , & self . metrics ) ;
2023-01-26 15:04:32 +01:00
2021-10-15 11:05:09 +02:00
self . local_status . swap ( Arc ::new ( new_si ) ) ;
}
2022-05-24 12:16:39 +02:00
// --- RPC HANDLERS ---
2021-10-15 11:05:09 +02:00
async fn handle_connect ( & self , node : & str ) -> Result < SystemRpc , Error > {
2022-05-24 12:16:39 +02:00
self . connect ( node ) . await ? ;
Ok ( SystemRpc ::Ok )
2021-10-14 11:50:12 +02:00
}
2021-10-15 11:05:09 +02:00
fn handle_get_known_nodes ( & self ) -> SystemRpc {
2022-05-24 12:16:39 +02:00
let known_nodes = self . get_known_nodes ( ) ;
2021-10-15 11:05:09 +02:00
SystemRpc ::ReturnKnownNodes ( known_nodes )
}
async fn handle_advertise_status (
self : & Arc < Self > ,
from : Uuid ,
info : & NodeStatus ,
) -> Result < SystemRpc , Error > {
let local_info = self . local_status . load ( ) ;
if local_info . replication_factor < info . replication_factor {
2022-09-13 16:22:23 +02:00
error! ( " Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. Shutting down for safety. " ,
2021-10-15 11:05:09 +02:00
info . replication_factor ,
local_info . replication_factor ) ;
std ::process ::exit ( 1 ) ;
}
2023-11-09 13:34:14 +01:00
self . layout_manager
2023-11-16 13:51:40 +01:00
. handle_advertise_status ( from , & info . layout_digest ) ;
2021-10-15 11:05:09 +02:00
self . node_status
. write ( )
. unwrap ( )
. insert ( from , ( now_msec ( ) , info . clone ( ) ) ) ;
Ok ( SystemRpc ::Ok )
}
async fn status_exchange_loop ( & self , mut stop_signal : watch ::Receiver < bool > ) {
while ! * stop_signal . borrow ( ) {
2023-09-12 14:35:48 +02:00
let restart_at = Instant ::now ( ) + STATUS_EXCHANGE_INTERVAL ;
2021-10-15 11:05:09 +02:00
self . update_local_status ( ) ;
let local_status : NodeStatus = self . local_status . load ( ) . as_ref ( ) . clone ( ) ;
2022-07-22 15:20:00 +02:00
let _ = self
2023-11-09 12:55:36 +01:00
. rpc_helper ( )
2021-10-15 11:05:09 +02:00
. broadcast (
& self . system_endpoint ,
SystemRpc ::AdvertiseStatus ( local_status ) ,
2023-09-12 14:35:48 +02:00
RequestStrategy ::with_priority ( PRIO_HIGH )
. with_custom_timeout ( STATUS_EXCHANGE_INTERVAL ) ,
2021-10-15 11:05:09 +02:00
)
. await ;
select! {
2023-09-12 14:35:48 +02:00
_ = tokio ::time ::sleep_until ( restart_at . into ( ) ) = > { } ,
_ = stop_signal . changed ( ) = > { } ,
2021-10-15 11:05:09 +02:00
}
}
}
async fn discovery_loop ( self : & Arc < Self > , mut stop_signal : watch ::Receiver < bool > ) {
2021-10-14 11:50:12 +02:00
while ! * stop_signal . borrow ( ) {
2023-11-09 12:55:36 +01:00
let not_configured = self . cluster_layout ( ) . check ( ) . is_err ( ) ;
2021-10-14 11:50:12 +02:00
let no_peers = self . fullmesh . get_peer_list ( ) . len ( ) < self . replication_factor ;
2023-11-14 13:06:16 +01:00
let expected_n_nodes = self . cluster_layout ( ) . all_nodes ( ) . len ( ) ;
2021-10-14 11:50:12 +02:00
let bad_peers = self
. fullmesh
. get_peer_list ( )
. iter ( )
. filter ( | p | p . is_up ( ) )
2021-11-09 12:24:04 +01:00
. count ( ) ! = expected_n_nodes ;
2021-10-14 11:50:12 +02:00
if not_configured | | no_peers | | bad_peers {
info! ( " Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {}) " , not_configured , no_peers , bad_peers ) ;
2022-09-14 16:09:38 +02:00
let mut ping_list = resolve_peers ( & self . bootstrap_peers ) . await ;
2021-10-14 11:50:12 +02:00
2021-10-15 11:05:09 +02:00
// Add peer list from list stored on disk
if let Ok ( peers ) = self . persist_peer_list . load_async ( ) . await {
2023-01-03 14:44:47 +01:00
ping_list . extend ( peers . 0. iter ( ) . map ( | ( id , addr ) | ( ( * id ) . into ( ) , * addr ) ) )
2021-10-14 11:50:12 +02:00
}
2021-10-15 11:05:09 +02:00
// Fetch peer list from Consul
2022-10-18 18:38:20 +02:00
#[ cfg(feature = " consul-discovery " ) ]
2022-03-16 12:09:50 +01:00
if let Some ( c ) = & self . consul_discovery {
2022-10-18 19:11:16 +02:00
match c . get_consul_nodes ( ) . await {
2021-10-14 11:50:12 +02:00
Ok ( node_list ) = > {
2021-10-15 11:05:09 +02:00
ping_list . extend ( node_list ) ;
2021-10-14 11:50:12 +02:00
}
Err ( e ) = > {
warn! ( " Could not retrieve node list from Consul: {} " , e ) ;
}
}
}
2022-03-06 14:50:00 +01:00
// Fetch peer list from Kubernetes
2022-03-16 12:09:50 +01:00
#[ cfg(feature = " kubernetes-discovery " ) ]
if let Some ( k ) = & self . kubernetes_discovery {
if ! k . skip_crd {
2022-03-06 14:50:00 +01:00
match create_kubernetes_crd ( ) . await {
Ok ( ( ) ) = > ( ) ,
Err ( e ) = > {
error! ( " Failed to create kubernetes custom resource: {} " , e )
}
} ;
}
2022-10-18 18:38:20 +02:00
match get_kubernetes_nodes ( k ) . await {
2022-03-06 14:50:00 +01:00
Ok ( node_list ) = > {
ping_list . extend ( node_list ) ;
}
Err ( e ) = > {
warn! ( " Could not retrieve node list from Kubernetes: {} " , e ) ;
}
}
}
2021-10-14 11:50:12 +02:00
for ( node_id , node_addr ) in ping_list {
2022-12-14 16:08:05 +01:00
let self2 = self . clone ( ) ;
tokio ::spawn ( async move {
if let Err ( e ) = self2 . netapp . clone ( ) . try_connect ( node_addr , node_id ) . await {
2022-12-14 16:11:19 +01:00
error! ( " {} " , connect_error_message ( node_addr , node_id , e ) ) ;
2022-12-14 16:08:05 +01:00
}
} ) ;
2021-10-14 11:50:12 +02:00
}
}
2021-11-03 17:34:44 +01:00
if let Err ( e ) = self . save_peer_list ( ) . await {
2021-10-15 11:05:09 +02:00
warn! ( " Could not save peer list to file: {} " , e ) ;
}
2022-10-18 18:38:20 +02:00
#[ cfg(feature = " consul-discovery " ) ]
2023-01-03 16:55:59 +01:00
tokio ::spawn ( self . clone ( ) . advertise_to_consul ( ) ) ;
2022-03-16 12:09:50 +01:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2023-01-03 16:55:59 +01:00
tokio ::spawn ( self . clone ( ) . advertise_to_kubernetes ( ) ) ;
2021-10-15 11:05:09 +02:00
2021-10-14 11:50:12 +02:00
select! {
2023-09-12 14:35:48 +02:00
_ = tokio ::time ::sleep ( DISCOVERY_INTERVAL ) = > { } ,
_ = stop_signal . changed ( ) = > { } ,
2021-10-14 11:50:12 +02:00
}
}
}
2021-11-03 17:34:44 +01:00
async fn save_peer_list ( & self ) -> Result < ( ) , Error > {
// Prepare new peer list to save to file
// It is a vec of tuples (node ID as Uuid, node SocketAddr)
let mut peer_list = self
. fullmesh
. get_peer_list ( )
. iter ( )
. map ( | n | ( n . id . into ( ) , n . addr ) )
. collect ::< Vec < _ > > ( ) ;
// Before doing it, we read the current peer list file (if it exists)
// and append it to the list we are about to save,
// so that no peer ID gets lost in the process.
if let Ok ( mut prev_peer_list ) = self . persist_peer_list . load_async ( ) . await {
2023-01-03 14:44:47 +01:00
prev_peer_list
. 0
. retain ( | ( id , _ip ) | peer_list . iter ( ) . all ( | ( id2 , _ip2 ) | id2 ! = id ) ) ;
peer_list . extend ( prev_peer_list . 0 ) ;
2021-11-03 17:34:44 +01:00
}
// Save new peer list to file
2023-01-03 14:44:47 +01:00
self . persist_peer_list
. save_async ( & PeerList ( peer_list ) )
. await
2021-11-03 17:34:44 +01:00
}
2021-10-14 11:50:12 +02:00
}
#[ async_trait ]
impl EndpointHandler < SystemRpc > for System {
2021-10-15 11:05:09 +02:00
async fn handle ( self : & Arc < Self > , msg : & SystemRpc , from : NodeID ) -> Result < SystemRpc , Error > {
match msg {
2023-11-09 12:55:36 +01:00
// ---- system functions -> System ----
2021-10-15 11:05:09 +02:00
SystemRpc ::Connect ( node ) = > self . handle_connect ( node ) . await ,
SystemRpc ::AdvertiseStatus ( adv ) = > self . handle_advertise_status ( from . into ( ) , adv ) . await ,
2023-11-09 12:55:36 +01:00
SystemRpc ::GetKnownNodes = > Ok ( self . handle_get_known_nodes ( ) ) ,
// ---- layout functions -> LayoutManager ----
SystemRpc ::PullClusterLayout = > Ok ( self . layout_manager . handle_pull_cluster_layout ( ) ) ,
2021-11-09 12:24:04 +01:00
SystemRpc ::AdvertiseClusterLayout ( adv ) = > {
2023-11-09 12:55:36 +01:00
self . layout_manager
. handle_advertise_cluster_layout ( adv )
. await
2021-11-09 12:24:04 +01:00
}
2023-11-09 14:53:34 +01:00
SystemRpc ::PullClusterLayoutTrackers = > {
Ok ( self . layout_manager . handle_pull_cluster_layout_trackers ( ) )
}
SystemRpc ::AdvertiseClusterLayoutTrackers ( adv ) = > {
self . layout_manager
. handle_advertise_cluster_layout_trackers ( adv )
. await
}
2023-11-09 12:55:36 +01:00
// ---- other -> Error ----
2022-01-03 13:58:05 +01:00
m = > Err ( Error ::unexpected_rpc_message ( m ) ) ,
2021-10-14 11:50:12 +02:00
}
}
}
2022-03-16 12:09:50 +01:00
2023-01-26 15:04:32 +01:00
impl NodeStatus {
2023-11-09 13:34:14 +01:00
fn initial ( replication_factor : usize , layout_manager : & LayoutManager ) -> Self {
2023-01-26 15:04:32 +01:00
NodeStatus {
2023-11-28 14:25:04 +01:00
hostname : Some (
gethostname ::gethostname ( )
. into_string ( )
. unwrap_or_else ( | _ | " <invalid utf-8> " . to_string ( ) ) ,
) ,
2023-01-26 15:04:32 +01:00
replication_factor ,
2023-11-16 13:51:40 +01:00
layout_digest : layout_manager . layout ( ) . digest ( ) ,
2023-01-26 15:04:32 +01:00
meta_disk_avail : None ,
data_disk_avail : None ,
}
}
fn unknown ( ) -> Self {
NodeStatus {
2023-11-28 14:25:04 +01:00
hostname : None ,
2023-01-26 15:04:32 +01:00
replication_factor : 0 ,
2023-11-16 13:51:40 +01:00
layout_digest : Default ::default ( ) ,
2023-01-26 15:04:32 +01:00
meta_disk_avail : None ,
data_disk_avail : None ,
}
}
2023-09-04 14:49:49 +02:00
fn update_disk_usage (
& mut self ,
meta_dir : & Path ,
data_dir : & DataDirEnum ,
metrics : & SystemMetrics ,
) {
2023-09-11 19:08:24 +02:00
use nix ::sys ::statvfs ::statvfs ;
let mount_avail = | path : & Path | match statvfs ( path ) {
Ok ( x ) = > {
2023-10-15 17:57:27 +02:00
let avail = x . blocks_available ( ) as u64 * x . fragment_size ( ) as u64 ;
let total = x . blocks ( ) as u64 * x . fragment_size ( ) as u64 ;
2023-09-11 20:00:02 +02:00
Some ( ( x . filesystem_id ( ) , avail , total ) )
2023-09-11 19:08:24 +02:00
}
Err ( _ ) = > None ,
2023-01-26 15:04:32 +01:00
} ;
2023-09-11 20:00:02 +02:00
self . meta_disk_avail = mount_avail ( meta_dir ) . map ( | ( _ , a , t ) | ( a , t ) ) ;
2023-09-04 14:49:49 +02:00
self . data_disk_avail = match data_dir {
2023-09-11 20:00:02 +02:00
DataDirEnum ::Single ( dir ) = > mount_avail ( dir ) . map ( | ( _ , a , t ) | ( a , t ) ) ,
DataDirEnum ::Multiple ( dirs ) = > ( | | {
// TODO: more precise calculation that takes into account
// how data is going to be spread among partitions
let mut mounts = HashMap ::new ( ) ;
for dir in dirs . iter ( ) {
if dir . capacity . is_none ( ) {
continue ;
}
match mount_avail ( & dir . path ) {
Some ( ( fsid , avail , total ) ) = > {
mounts . insert ( fsid , ( avail , total ) ) ;
}
None = > return None ,
}
2023-09-07 14:42:20 +02:00
}
2023-09-11 20:00:02 +02:00
Some (
mounts
. into_iter ( )
. fold ( ( 0 , 0 ) , | ( x , y ) , ( _ , ( a , b ) ) | ( x + a , y + b ) ) ,
)
} ) ( ) ,
2023-09-04 14:49:49 +02:00
} ;
2023-01-26 15:30:36 +01:00
if let Some ( ( avail , total ) ) = self . meta_disk_avail {
metrics
. values
. meta_disk_avail
. store ( avail , Ordering ::Relaxed ) ;
metrics
. values
. meta_disk_total
. store ( total , Ordering ::Relaxed ) ;
}
if let Some ( ( avail , total ) ) = self . data_disk_avail {
metrics
. values
. data_disk_avail
. store ( avail , Ordering ::Relaxed ) ;
metrics
. values
. data_disk_total
. store ( total , Ordering ::Relaxed ) ;
}
2023-01-26 15:04:32 +01:00
}
}
2022-03-16 12:09:50 +01:00
/// Returns the first address of the first network interface that is up,
/// is not a loopback interface and has at least one address, if any.
fn get_default_ip() -> Option<IpAddr> {
    for iface in pnet_datalink::interfaces().iter() {
        if !iface.is_up() || iface.is_loopback() {
            continue;
        }
        if let Some(net) = iface.ips.first() {
            return Some(net.ip());
        }
    }
    None
}
2023-11-09 12:55:36 +01:00
fn get_rpc_public_addr ( config : & Config ) -> Option < SocketAddr > {
match & config . rpc_public_addr {
Some ( a_str ) = > {
use std ::net ::ToSocketAddrs ;
match a_str . to_socket_addrs ( ) {
Err ( e ) = > {
error! (
" Cannot resolve rpc_public_addr {} from config file: {}. " ,
a_str , e
) ;
None
}
Ok ( a ) = > {
let a = a . collect ::< Vec < _ > > ( ) ;
if a . is_empty ( ) {
error! ( " rpc_public_addr {} resolve to no known IP address " , a_str ) ;
}
if a . len ( ) > 1 {
warn! ( " Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one. " , a ) ;
}
a . into_iter ( ) . next ( )
}
}
}
None = > {
let addr = get_default_ip ( ) . map ( | ip | SocketAddr ::new ( ip , config . rpc_bind_addr . port ( ) ) ) ;
if let Some ( a ) = addr {
warn! ( " Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible. " , a ) ;
}
addr
}
}
}
2022-09-14 16:09:38 +02:00
async fn resolve_peers ( peers : & [ String ] ) -> Vec < ( NodeID , SocketAddr ) > {
let mut ret = vec! [ ] ;
for peer in peers . iter ( ) {
match parse_and_resolve_peer_addr_async ( peer ) . await {
Some ( ( pubkey , addrs ) ) = > {
for ip in addrs {
ret . push ( ( pubkey , ip ) ) ;
}
}
None = > {
warn! ( " Unable to parse and/or resolve peer hostname {} " , peer ) ;
}
}
}
ret
}
2022-12-14 12:57:33 +01:00
2022-12-14 16:11:19 +01:00
fn connect_error_message (
addr : SocketAddr ,
pubkey : ed25519 ::PublicKey ,
e : netapp ::error ::Error ,
) -> String {
format! ( " Error establishing RPC connection to remote node: {} @ {} . \n This can happen if the remote node is not reachable on the network, but also if the two nodes are not configured with the same rpc_secret. \n {} " , hex ::encode ( pubkey ) , addr , e )
2022-12-14 12:57:33 +01:00
}