2021-10-14 09:50:12 +00:00
//! Module containing structs related to membership management
2023-11-27 11:10:21 +00:00
use std ::collections ::{ HashMap , HashSet } ;
2021-10-14 09:50:12 +00:00
use std ::io ::{ Read , Write } ;
2022-03-06 13:50:00 +00:00
use std ::net ::{ IpAddr , SocketAddr } ;
2022-07-08 11:30:26 +00:00
use std ::path ::{ Path , PathBuf } ;
2023-11-09 13:12:05 +00:00
use std ::sync ::{ Arc , RwLock , RwLockReadGuard } ;
2021-10-19 14:16:10 +00:00
use std ::time ::{ Duration , Instant } ;
2021-10-14 09:50:12 +00:00
2024-02-20 13:20:58 +00:00
use arc_swap ::ArcSwapOption ;
2021-10-14 09:50:12 +00:00
use async_trait ::async_trait ;
2023-09-12 12:35:48 +00:00
use futures ::join ;
2021-10-14 09:50:12 +00:00
use serde ::{ Deserialize , Serialize } ;
use sodiumoxide ::crypto ::sign ::ed25519 ;
2023-09-12 12:35:48 +00:00
use tokio ::select ;
2023-11-09 13:12:05 +00:00
use tokio ::sync ::{ watch , Notify } ;
2021-10-14 09:50:12 +00:00
2024-02-13 11:55:41 +00:00
use garage_net ::endpoint ::{ Endpoint , EndpointHandler } ;
use garage_net ::message ::* ;
2024-03-21 09:45:34 +00:00
use garage_net ::peering ::{ PeerConnState , PeeringManager } ;
2024-02-13 11:55:41 +00:00
use garage_net ::util ::parse_and_resolve_peer_addr_async ;
use garage_net ::{ NetApp , NetworkKey , NodeID , NodeKey } ;
2021-10-14 09:50:12 +00:00
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
use garage_util ::config ::KubernetesDiscoveryConfig ;
2023-09-04 12:49:49 +00:00
use garage_util ::config ::{ Config , DataDirEnum } ;
2021-11-09 11:24:04 +00:00
use garage_util ::data ::* ;
2021-10-19 14:16:10 +00:00
use garage_util ::error ::* ;
2021-10-14 09:50:12 +00:00
use garage_util ::persister ::Persister ;
2021-10-15 09:05:09 +00:00
use garage_util ::time ::* ;
2021-10-14 09:50:12 +00:00
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 17:11:16 +00:00
use crate ::consul ::ConsulDiscovery ;
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-03-06 13:50:00 +00:00
use crate ::kubernetes ::* ;
2023-11-16 12:51:40 +00:00
use crate ::layout ::{
2023-12-11 15:09:22 +00:00
self , manager ::LayoutManager , LayoutHelper , LayoutHistory , NodeRoleV , RpcLayoutDigest ,
2023-11-16 12:51:40 +00:00
} ;
2022-12-05 14:28:57 +00:00
use crate ::replication_mode ::* ;
2021-10-15 09:05:09 +00:00
use crate ::rpc_helper ::* ;
2021-10-14 09:50:12 +00:00
2023-01-09 17:15:55 +00:00
use crate ::system_metrics ::* ;
2021-10-14 09:50:12 +00:00
/// Time to wait between two iterations of the peer discovery loop.
const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);

/// Period of the status exchange loop; also used as the timeout of each
/// status broadcast.
const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);

/// Version tag checked when a Netapp connection is established.
/// Cluster nodes whose version tags differ are considered incompatible
/// and refuse to connect to each other.
///
/// The six high-order bytes spell `garage` in ASCII, the two low-order
/// bytes encode the version.
pub const GARAGE_VERSION_TAG: u64 = 0x6761_7261_6765_0010; // garage 0x0010 (1.0)

/// Path of the RPC endpoint used for membership-related calls
pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc";
2021-10-14 09:50:12 +00:00
/// RPC messages related to membership
#[ derive(Debug, Serialize, Deserialize, Clone) ]
pub enum SystemRpc {
/// Response to successfull advertisements
Ok ,
2024-01-16 13:04:11 +00:00
/// Request to connect to a specific node (in <pubkey>@<host>:<port> format, pubkey = full-length node ID)
2021-10-15 09:05:09 +00:00
Connect ( String ) ,
2021-10-14 09:50:12 +00:00
/// Advertise Garage status. Answered with another AdvertiseStatus.
/// Exchanged with every node on a regular basis.
2021-10-15 09:05:09 +00:00
AdvertiseStatus ( NodeStatus ) ,
2021-10-14 09:50:12 +00:00
/// Get known nodes states
GetKnownNodes ,
/// Return known nodes
2021-10-15 09:05:09 +00:00
ReturnKnownNodes ( Vec < KnownNodeInfo > ) ,
2023-11-09 13:53:34 +00:00
2023-11-09 11:55:36 +00:00
/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
PullClusterLayout ,
/// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout
2023-11-09 13:12:05 +00:00
AdvertiseClusterLayout ( LayoutHistory ) ,
2023-11-09 13:53:34 +00:00
/// Ask other node its cluster layout update trackers.
PullClusterLayoutTrackers ,
/// Advertisement of cluster layout update trackers.
AdvertiseClusterLayoutTrackers ( layout ::UpdateTrackers ) ,
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
impl Rpc for SystemRpc {
type Response = Result < SystemRpc , Error > ;
2021-10-14 09:50:12 +00:00
}
2023-01-03 13:44:47 +00:00
/// List of known peers, as `(node ID, socket address)` pairs, in the form
/// that is stored by the peer list persister.
#[derive(Serialize, Deserialize)]
pub struct PeerList(Vec<(Uuid, SocketAddr)>);

// Registers `PeerList` with garage_util's format migration mechanism as an
// `InitialFormat` (presumably: a format with no predecessor to migrate from).
impl garage_util::migrate::InitialFormat for PeerList {}
2021-10-14 09:50:12 +00:00
/// This node's membership manager
pub struct System {
/// The id of this node
2021-10-15 09:05:09 +00:00
pub id : Uuid ,
2021-10-14 09:50:12 +00:00
2023-01-03 13:44:47 +00:00
persist_peer_list : Persister < PeerList > ,
2021-10-14 09:50:12 +00:00
2024-02-20 13:20:58 +00:00
pub ( crate ) local_status : RwLock < NodeStatus > ,
2021-10-15 09:05:09 +00:00
node_status : RwLock < HashMap < Uuid , ( u64 , NodeStatus ) > > ,
2021-10-14 09:50:12 +00:00
pub netapp : Arc < NetApp > ,
2024-02-14 09:04:46 +00:00
peering : Arc < PeeringManager > ,
2021-10-14 09:50:12 +00:00
2023-11-09 11:55:36 +00:00
pub ( crate ) system_endpoint : Arc < Endpoint < SystemRpc , System > > ,
2021-10-14 09:50:12 +00:00
rpc_listen_addr : SocketAddr ,
2021-10-15 09:05:09 +00:00
rpc_public_addr : Option < SocketAddr > ,
2022-09-14 14:09:38 +00:00
bootstrap_peers : Vec < String > ,
2022-03-06 13:50:00 +00:00
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 17:11:16 +00:00
consul_discovery : Option < ConsulDiscovery > ,
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-10-18 16:38:20 +00:00
kubernetes_discovery : Option < KubernetesDiscoveryConfig > ,
2022-03-06 13:50:00 +00:00
2023-11-09 12:34:14 +00:00
pub layout_manager : Arc < LayoutManager > ,
2023-11-09 11:55:36 +00:00
2024-02-20 13:20:58 +00:00
metrics : ArcSwapOption < SystemMetrics > ,
2022-03-06 13:50:00 +00:00
2024-03-04 18:58:32 +00:00
pub ( crate ) replication_factor : ReplicationFactor ,
2021-10-14 09:50:12 +00:00
2022-07-08 11:30:26 +00:00
/// Path to metadata directory
pub metadata_dir : PathBuf ,
2023-01-26 14:04:32 +00:00
/// Path to data directory
2023-09-04 12:49:49 +00:00
pub data_dir : DataDirEnum ,
2021-10-14 09:50:12 +00:00
}
#[ derive(Debug, Clone, Serialize, Deserialize) ]
2021-10-15 09:05:09 +00:00
pub struct NodeStatus {
2021-10-14 09:50:12 +00:00
/// Hostname of the node
2023-11-28 13:25:04 +00:00
pub hostname : Option < String > ,
2023-01-26 14:04:32 +00:00
2021-10-14 09:50:12 +00:00
/// Replication factor configured on the node
pub replication_factor : usize ,
2023-11-09 11:55:36 +00:00
2023-11-16 12:51:40 +00:00
/// Cluster layout digest
2023-12-11 15:09:22 +00:00
pub layout_digest : RpcLayoutDigest ,
2023-01-26 14:04:32 +00:00
/// Disk usage on partition containing metadata directory (tuple: `(avail, total)`)
#[ serde(default) ]
pub meta_disk_avail : Option < ( u64 , u64 ) > ,
/// Disk usage on partition containing data directory (tuple: `(avail, total)`)
#[ serde(default) ]
pub data_disk_avail : Option < ( u64 , u64 ) > ,
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
#[ derive(Debug, Clone, Serialize, Deserialize) ]
pub struct KnownNodeInfo {
pub id : Uuid ,
2024-03-21 09:45:34 +00:00
pub addr : Option < SocketAddr > ,
2021-10-15 09:05:09 +00:00
pub is_up : bool ,
2021-10-19 14:16:10 +00:00
pub last_seen_secs_ago : Option < u64 > ,
2021-10-15 09:05:09 +00:00
pub status : NodeStatus ,
}
2023-06-14 11:53:19 +00:00
#[ derive(Debug, Clone, Copy) ]
2022-12-05 14:28:57 +00:00
pub struct ClusterHealth {
/// The current health status of the cluster (see below)
pub status : ClusterHealthStatus ,
/// Number of nodes already seen once in the cluster
pub known_nodes : usize ,
/// Number of nodes currently connected
pub connected_nodes : usize ,
/// Number of storage nodes declared in the current layout
pub storage_nodes : usize ,
/// Number of storage nodes currently connected
pub storage_nodes_ok : usize ,
/// Number of partitions in the layout
pub partitions : usize ,
/// Number of partitions for which we have a quorum of connected nodes
pub partitions_quorum : usize ,
/// Number of partitions for which all storage nodes are connected
pub partitions_all_ok : usize ,
}
2024-02-20 11:37:55 +00:00
/// Overall health status of the cluster
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ClusterHealthStatus {
    /// All nodes are available
    Healthy,
    /// Some storage nodes are unavailable, but quorum is still
    /// achieved for all partitions
    Degraded,
    /// Quorum is not available for some partitions
    Unavailable,
}
2021-10-19 14:16:10 +00:00
pub fn read_node_id ( metadata_dir : & Path ) -> Result < NodeID , Error > {
let mut pubkey_file = metadata_dir . to_path_buf ( ) ;
pubkey_file . push ( " node_key.pub " ) ;
let mut f = std ::fs ::File ::open ( pubkey_file . as_path ( ) ) ? ;
let mut d = vec! [ ] ;
f . read_to_end ( & mut d ) ? ;
if d . len ( ) ! = 32 {
return Err ( Error ::Message ( " Corrupt node_key.pub file " . to_string ( ) ) ) ;
}
let mut key = [ 0 u8 ; 32 ] ;
key . copy_from_slice ( & d [ .. ] ) ;
Ok ( NodeID ::from_slice ( & key [ .. ] ) . unwrap ( ) )
}
pub fn gen_node_key ( metadata_dir : & Path ) -> Result < NodeKey , Error > {
2021-10-15 09:05:09 +00:00
let mut key_file = metadata_dir . to_path_buf ( ) ;
key_file . push ( " node_key " ) ;
if key_file . as_path ( ) . exists ( ) {
let mut f = std ::fs ::File ::open ( key_file . as_path ( ) ) ? ;
2021-10-14 09:50:12 +00:00
let mut d = vec! [ ] ;
f . read_to_end ( & mut d ) ? ;
if d . len ( ) ! = 64 {
2021-10-15 09:05:09 +00:00
return Err ( Error ::Message ( " Corrupt node_key file " . to_string ( ) ) ) ;
2021-10-14 09:50:12 +00:00
}
let mut key = [ 0 u8 ; 64 ] ;
key . copy_from_slice ( & d [ .. ] ) ;
Ok ( NodeKey ::from_slice ( & key [ .. ] ) . unwrap ( ) )
} else {
2021-10-19 14:16:10 +00:00
if ! metadata_dir . exists ( ) {
info! ( " Metadata directory does not exist, creating it. " ) ;
2023-01-26 16:26:32 +00:00
std ::fs ::create_dir ( metadata_dir ) ? ;
2021-10-19 14:16:10 +00:00
}
info! ( " Generating new node key pair. " ) ;
let ( pubkey , key ) = ed25519 ::gen_keypair ( ) ;
{
use std ::os ::unix ::fs ::PermissionsExt ;
let mut f = std ::fs ::File ::create ( key_file . as_path ( ) ) ? ;
let mut perm = f . metadata ( ) ? . permissions ( ) ;
perm . set_mode ( 0o600 ) ;
std ::fs ::set_permissions ( key_file . as_path ( ) , perm ) ? ;
f . write_all ( & key [ .. ] ) ? ;
}
{
let mut pubkey_file = metadata_dir . to_path_buf ( ) ;
pubkey_file . push ( " node_key.pub " ) ;
let mut f2 = std ::fs ::File ::create ( pubkey_file . as_path ( ) ) ? ;
f2 . write_all ( & pubkey [ .. ] ) ? ;
}
2021-10-14 09:50:12 +00:00
2021-10-15 09:05:09 +00:00
Ok ( key )
2021-10-14 09:50:12 +00:00
}
}
impl System {
/// Create this node's membership manager
pub fn new (
network_key : NetworkKey ,
2024-03-04 18:58:32 +00:00
replication_factor : ReplicationFactor ,
consistency_mode : ConsistencyMode ,
2021-10-19 14:16:10 +00:00
config : & Config ,
2022-09-13 14:22:23 +00:00
) -> Result < Arc < Self > , Error > {
2023-11-09 11:55:36 +00:00
// ---- setup netapp RPC protocol ----
2021-10-19 14:16:10 +00:00
let node_key =
gen_node_key ( & config . metadata_dir ) . expect ( " Unable to read or generate node ID " ) ;
2022-02-18 19:39:55 +00:00
info! (
" Node ID of this node: {} " ,
hex ::encode ( & node_key . public_key ( ) [ .. 8 ] )
) ;
2021-10-14 09:50:12 +00:00
2024-02-20 16:02:44 +00:00
let bind_outgoing_to = Some ( config )
. filter ( | x | x . rpc_bind_outgoing )
. map ( | x | x . rpc_bind_addr . ip ( ) ) ;
let netapp = NetApp ::new ( GARAGE_VERSION_TAG , network_key , node_key , bind_outgoing_to ) ;
2023-11-09 11:55:36 +00:00
let system_endpoint = netapp . endpoint ( SYSTEM_RPC_PATH . into ( ) ) ;
2021-10-14 09:50:12 +00:00
2023-11-09 11:55:36 +00:00
// ---- setup netapp public listener and full mesh peering strategy ----
let rpc_public_addr = get_rpc_public_addr ( config ) ;
2022-09-14 14:09:38 +00:00
if rpc_public_addr . is_none ( ) {
warn! ( " This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication. " ) ;
}
2022-03-16 11:09:50 +00:00
2024-02-14 09:04:46 +00:00
let peering = PeeringManager ::new ( netapp . clone ( ) , vec! [ ] , rpc_public_addr ) ;
2022-09-19 18:12:19 +00:00
if let Some ( ping_timeout ) = config . rpc_ping_timeout_msec {
2024-02-14 09:04:46 +00:00
peering . set_ping_timeout_millis ( ping_timeout ) ;
2022-09-19 18:12:19 +00:00
}
2021-10-14 09:50:12 +00:00
2023-11-09 11:55:36 +00:00
let persist_peer_list = Persister ::new ( & config . metadata_dir , " peer_list " ) ;
2021-10-14 09:50:12 +00:00
2023-11-09 11:55:36 +00:00
// ---- setup cluster layout and layout manager ----
let layout_manager = LayoutManager ::new (
config ,
netapp . id ,
system_endpoint . clone ( ) ,
2024-02-15 12:58:15 +00:00
peering . clone ( ) ,
2024-03-04 18:58:32 +00:00
replication_factor ,
consistency_mode ,
2023-11-09 11:55:36 +00:00
) ? ;
2023-11-09 12:34:14 +00:00
let mut local_status = NodeStatus ::initial ( replication_factor , & layout_manager ) ;
2024-02-20 16:02:44 +00:00
local_status . update_disk_usage ( & config . metadata_dir , & config . data_dir ) ;
2023-11-09 11:55:36 +00:00
// ---- if enabled, set up additionnal peer discovery methods ----
2022-10-18 17:13:52 +00:00
#[ cfg(feature = " consul-discovery " ) ]
let consul_discovery = match & config . consul_discovery {
Some ( cfg ) = > Some (
ConsulDiscovery ::new ( cfg . clone ( ) )
. ok_or_message ( " Invalid Consul discovery configuration " ) ? ,
) ,
None = > None ,
} ;
2022-10-18 16:38:20 +00:00
#[ cfg(not(feature = " consul-discovery " )) ]
if config . consul_discovery . is_some ( ) {
warn! ( " Consul discovery is not enabled in this build. " ) ;
}
2022-03-16 11:09:50 +00:00
#[ cfg(not(feature = " kubernetes-discovery " )) ]
2022-10-18 16:38:20 +00:00
if config . kubernetes_discovery . is_some ( ) {
2022-03-16 11:09:50 +00:00
warn! ( " Kubernetes discovery is not enabled in this build. " ) ;
}
2024-02-20 16:02:44 +00:00
// ---- almost done ----
2021-10-14 09:50:12 +00:00
let sys = Arc ::new ( System {
2021-10-15 09:05:09 +00:00
id : netapp . id . into ( ) ,
persist_peer_list ,
2024-02-20 13:20:58 +00:00
local_status : RwLock ::new ( local_status ) ,
2021-10-15 09:05:09 +00:00
node_status : RwLock ::new ( HashMap ::new ( ) ) ,
2021-10-14 09:50:12 +00:00
netapp : netapp . clone ( ) ,
2024-02-14 09:04:46 +00:00
peering : peering . clone ( ) ,
2021-10-14 09:50:12 +00:00
system_endpoint ,
replication_factor ,
2021-10-19 14:16:10 +00:00
rpc_listen_addr : config . rpc_bind_addr ,
2022-03-16 11:09:50 +00:00
rpc_public_addr ,
2021-10-19 14:16:10 +00:00
bootstrap_peers : config . bootstrap_peers . clone ( ) ,
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2022-10-18 17:11:16 +00:00
consul_discovery ,
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2022-10-18 16:38:20 +00:00
kubernetes_discovery : config . kubernetes_discovery . clone ( ) ,
2023-11-09 11:55:36 +00:00
layout_manager ,
2024-02-20 13:20:58 +00:00
metrics : ArcSwapOption ::new ( None ) ,
2022-03-06 13:50:00 +00:00
2022-07-08 11:30:26 +00:00
metadata_dir : config . metadata_dir . clone ( ) ,
2023-01-26 14:04:32 +00:00
data_dir : config . data_dir . clone ( ) ,
2021-10-14 09:50:12 +00:00
} ) ;
2024-02-20 13:20:58 +00:00
2021-10-14 09:50:12 +00:00
sys . system_endpoint . set_handler ( sys . clone ( ) ) ;
2024-02-20 13:20:58 +00:00
let metrics = SystemMetrics ::new ( sys . clone ( ) ) ;
sys . metrics . store ( Some ( Arc ::new ( metrics ) ) ) ;
2022-09-13 14:22:23 +00:00
Ok ( sys )
2021-10-14 09:50:12 +00:00
}
/// Perform bootstrapping and run the node's background tasks: the RPC
/// listener, the peering manager, the discovery loop and the status
/// exchange loop.
///
/// Each task receives its own copy of `must_exit`; this function returns
/// once all four tasks have terminated.
pub async fn run(self: Arc<Self>, must_exit: watch::Receiver<bool>) {
    let listener = self.netapp.clone().listen(
        self.rpc_listen_addr,
        self.rpc_public_addr,
        must_exit.clone(),
    );
    let peering = self.peering.clone().run(must_exit.clone());
    let discovery = self.discovery_loop(must_exit.clone());
    let status_exchange = self.status_exchange_loop(must_exit.clone());

    join!(listener, peering, discovery, status_exchange);
}
2024-02-20 13:20:58 +00:00
/// Detach the metrics from this `System`.
///
/// The metrics are built from a clone of the `Arc<System>`, so the system
/// and its metrics reference each other; clearing them here breaks that
/// reference cycle.
pub fn cleanup(&self) {
    self.metrics.store(None);
}
2023-11-09 11:55:36 +00:00
// ---- Public utilities / accessors ----
2023-11-15 13:20:50 +00:00
/// Acquire a read lock on the current cluster layout
pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHelper> {
    let manager = &self.layout_manager;
    manager.layout()
}
2023-11-09 13:12:05 +00:00
/// Get a handle to the layout manager's `change_notify` notifier
pub fn layout_notify(&self) -> Arc<Notify> {
    Arc::clone(&self.layout_manager.change_notify)
}
/// Borrow the RPC helper owned by the layout manager
pub fn rpc_helper(&self) -> &RpcHelper {
    let manager = &self.layout_manager;
    &manager.rpc_helper
}
2022-05-24 10:16:39 +00:00
// ---- Administrative operations (directly available and
// also available through RPC) ----
pub fn get_known_nodes ( & self ) -> Vec < KnownNodeInfo > {
let node_status = self . node_status . read ( ) . unwrap ( ) ;
let known_nodes = self
2024-02-14 09:04:46 +00:00
. peering
2022-05-24 10:16:39 +00:00
. get_peer_list ( )
. iter ( )
. map ( | n | KnownNodeInfo {
id : n . id . into ( ) ,
2024-03-21 09:45:34 +00:00
addr : match n . state {
PeerConnState ::Ourself = > self . rpc_public_addr ,
PeerConnState ::Connected { addr } = > Some ( addr ) ,
_ = > None ,
} ,
2022-05-24 10:16:39 +00:00
is_up : n . is_up ( ) ,
2022-09-29 13:53:54 +00:00
last_seen_secs_ago : n
. last_seen
. map ( | t | ( Instant ::now ( ) . saturating_duration_since ( t ) ) . as_secs ( ) ) ,
2022-05-24 10:16:39 +00:00
status : node_status
. get ( & n . id . into ( ) )
. cloned ( )
. map ( | ( _ , st ) | st )
2023-01-26 16:26:32 +00:00
. unwrap_or_else ( NodeStatus ::unknown ) ,
2022-05-24 10:16:39 +00:00
} )
. collect ::< Vec < _ > > ( ) ;
known_nodes
}
pub async fn connect ( & self , node : & str ) -> Result < ( ) , Error > {
2022-09-14 14:09:38 +00:00
let ( pubkey , addrs ) = parse_and_resolve_peer_addr_async ( node )
. await
. ok_or_else ( | | {
Error ::Message ( format! (
" Unable to parse or resolve node specification: {} " ,
node
) )
} ) ? ;
2022-05-24 10:16:39 +00:00
let mut errors = vec! [ ] ;
2022-12-14 11:57:33 +00:00
for addr in addrs . iter ( ) {
2022-12-14 15:11:19 +00:00
match self . netapp . clone ( ) . try_connect ( * addr , pubkey ) . await {
2022-05-24 10:16:39 +00:00
Ok ( ( ) ) = > return Ok ( ( ) ) ,
Err ( e ) = > {
2022-12-14 15:11:19 +00:00
errors . push ( (
* addr ,
Error ::Message ( connect_error_message ( * addr , pubkey , e ) ) ,
) ) ;
2022-05-24 10:16:39 +00:00
}
}
}
if errors . len ( ) = = 1 {
Err ( Error ::Message ( errors [ 0 ] . 1. to_string ( ) ) )
} else {
Err ( Error ::Message ( format! ( " {:?} " , errors ) ) )
}
}
2022-12-05 14:28:57 +00:00
pub fn health ( & self ) -> ClusterHealth {
2024-03-04 18:58:32 +00:00
let quorum = self
. replication_factor
. write_quorum ( ConsistencyMode ::Consistent ) ;
2022-12-05 14:28:57 +00:00
2023-11-27 11:10:21 +00:00
// Gather information about running nodes.
// Technically, `nodes` contains currently running nodes, as well
// as nodes that this Garage process has been connected to at least
// once since it started.
2022-12-05 14:28:57 +00:00
let nodes = self
. get_known_nodes ( )
. into_iter ( )
. map ( | n | ( n . id , n ) )
. collect ::< HashMap < Uuid , _ > > ( ) ;
let connected_nodes = nodes . iter ( ) . filter ( | ( _ , n ) | n . is_up ) . count ( ) ;
2023-11-27 11:10:21 +00:00
let node_up = | x : & Uuid | nodes . get ( x ) . map ( | n | n . is_up ) . unwrap_or ( false ) ;
// Acquire a rwlock read-lock to the current cluster layout
let layout = self . cluster_layout ( ) ;
// Obtain information about nodes that have a role as storage nodes
// in one of the active layout versions
let mut storage_nodes = HashSet ::< Uuid > ::with_capacity ( 16 ) ;
2024-03-27 12:32:13 +00:00
for ver in layout . versions ( ) . iter ( ) {
2023-11-27 11:10:21 +00:00
storage_nodes . extend (
ver . roles
. items ( )
. iter ( )
. filter ( | ( _ , _ , v ) | matches! ( v , NodeRoleV ( Some ( r ) ) if r . capacity . is_some ( ) ) )
. map ( | ( n , _ , _ ) | * n ) ,
)
}
let storage_nodes_ok = storage_nodes . iter ( ) . filter ( | x | node_up ( x ) ) . count ( ) ;
2022-12-05 14:28:57 +00:00
2023-11-27 11:10:21 +00:00
// Determine the number of partitions that have:
// - a quorum of up nodes for all write sets (i.e. are available)
// - for which all nodes in all write sets are up (i.e. are fully healthy)
2023-11-11 11:08:32 +00:00
let partitions = layout . current ( ) . partitions ( ) . collect ::< Vec < _ > > ( ) ;
2023-11-27 11:10:21 +00:00
let mut partitions_quorum = 0 ;
let mut partitions_all_ok = 0 ;
for ( _ , hash ) in partitions . iter ( ) {
2023-12-08 09:36:37 +00:00
let mut write_sets = layout
2024-03-27 12:32:13 +00:00
. versions ( )
2023-11-27 11:10:21 +00:00
. iter ( )
. map ( | x | x . nodes_of ( hash , x . replication_factor ) ) ;
let has_quorum = write_sets
. clone ( )
. all ( | set | set . filter ( | x | node_up ( x ) ) . count ( ) > = quorum ) ;
2023-12-08 09:36:37 +00:00
let all_ok = write_sets . all ( | mut set | set . all ( | x | node_up ( & x ) ) ) ;
2023-11-27 11:10:21 +00:00
if has_quorum {
partitions_quorum + = 1 ;
}
if all_ok {
partitions_all_ok + = 1 ;
}
}
2022-12-05 14:28:57 +00:00
2023-11-27 11:10:21 +00:00
// Determine overall cluster status
2022-12-05 14:28:57 +00:00
let status =
2023-12-08 09:36:37 +00:00
if partitions_all_ok = = partitions . len ( ) & & storage_nodes_ok = = storage_nodes . len ( ) {
2022-12-05 14:28:57 +00:00
ClusterHealthStatus ::Healthy
} else if partitions_quorum = = partitions . len ( ) {
ClusterHealthStatus ::Degraded
} else {
ClusterHealthStatus ::Unavailable
} ;
ClusterHealth {
status ,
known_nodes : nodes . len ( ) ,
connected_nodes ,
storage_nodes : storage_nodes . len ( ) ,
storage_nodes_ok ,
partitions : partitions . len ( ) ,
partitions_quorum ,
partitions_all_ok ,
}
}
2021-10-14 09:50:12 +00:00
// ---- INTERNALS ----
2022-10-18 16:38:20 +00:00
/// Publish this node (ID, hostname and public RPC address) as a Consul service.
///
/// This is a best-effort operation: nothing is published if Consul
/// discovery is not configured, if the public RPC address is unknown or if
/// the local hostname is unknown, and a failure to publish is only logged.
#[cfg(feature = "consul-discovery")]
async fn advertise_to_consul(self: Arc<Self>) {
    let c = match &self.consul_discovery {
        Some(c) => c,
        _ => return,
    };

    let rpc_public_addr = match self.rpc_public_addr {
        Some(addr) => addr,
        None => {
            warn!("Not advertising to Consul because rpc_public_addr is not defined in config file and could not be autodetected.");
            return;
        }
    };

    // Clone the hostname in its own statement so that the lock guard is
    // released before anything is awaited
    let hostname = self.local_status.read().unwrap().hostname.clone();
    let hostname = match hostname {
        Some(hostname) => hostname,
        None => {
            // `hostname` is optional in `NodeStatus`: skip this round
            // instead of panicking in the background task
            warn!("Not advertising to Consul because the hostname of this node is not known.");
            return;
        }
    };

    if let Err(e) = c
        .publish_consul_service(self.netapp.id, &hostname, rpc_public_addr)
        .await
    {
        error!("Error while publishing Consul service: {}", e);
    }
}
2022-03-16 11:09:50 +00:00
/// Publish this node (ID, hostname and public RPC address) to Kubernetes.
///
/// This is a best-effort operation: nothing is published if Kubernetes
/// discovery is not configured, if the public RPC address is unknown or if
/// the local hostname is unknown, and a failure to publish is only logged.
#[cfg(feature = "kubernetes-discovery")]
async fn advertise_to_kubernetes(self: Arc<Self>) {
    let k = match &self.kubernetes_discovery {
        Some(k) => k,
        _ => return,
    };

    let rpc_public_addr = match self.rpc_public_addr {
        Some(addr) => addr,
        None => {
            warn!("Not advertising to Kubernetes because rpc_public_addr is not defined in config file and could not be autodetected.");
            return;
        }
    };

    // Clone the hostname in its own statement so that the lock guard is
    // released before anything is awaited
    let hostname = self.local_status.read().unwrap().hostname.clone();
    let hostname = match hostname {
        Some(hostname) => hostname,
        None => {
            // `hostname` is optional in `NodeStatus`: skip this round
            // instead of panicking in the background task
            warn!("Not advertising to Kubernetes because the hostname of this node is not known.");
            return;
        }
    };

    if let Err(e) = publish_kubernetes_node(k, self.netapp.id, &hostname, rpc_public_addr).await
    {
        error!("Error while publishing node to Kubernetes: {}", e);
    }
}
2021-10-15 09:05:09 +00:00
fn update_local_status ( & self ) {
2024-02-20 10:35:18 +00:00
let mut local_status = self . local_status . write ( ) . unwrap ( ) ;
2024-02-20 16:02:44 +00:00
local_status . layout_digest = self . layout_manager . layout ( ) . digest ( ) ;
2024-02-20 10:35:18 +00:00
local_status . update_disk_usage ( & self . metadata_dir , & self . data_dir ) ;
2021-10-15 09:05:09 +00:00
}
2022-05-24 10:16:39 +00:00
// --- RPC HANDLERS ---
2021-10-15 09:05:09 +00:00
async fn handle_connect ( & self , node : & str ) -> Result < SystemRpc , Error > {
2022-05-24 10:16:39 +00:00
self . connect ( node ) . await ? ;
Ok ( SystemRpc ::Ok )
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
fn handle_get_known_nodes ( & self ) -> SystemRpc {
2022-05-24 10:16:39 +00:00
let known_nodes = self . get_known_nodes ( ) ;
2021-10-15 09:05:09 +00:00
SystemRpc ::ReturnKnownNodes ( known_nodes )
}
async fn handle_advertise_status (
self : & Arc < Self > ,
from : Uuid ,
info : & NodeStatus ,
) -> Result < SystemRpc , Error > {
2024-02-20 10:35:18 +00:00
let local_info = self . local_status . read ( ) . unwrap ( ) ;
2021-10-15 09:05:09 +00:00
if local_info . replication_factor < info . replication_factor {
2022-09-13 14:22:23 +00:00
error! ( " Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. Shutting down for safety. " ,
2021-10-15 09:05:09 +00:00
info . replication_factor ,
local_info . replication_factor ) ;
std ::process ::exit ( 1 ) ;
}
2023-11-09 12:34:14 +00:00
self . layout_manager
2023-11-16 12:51:40 +00:00
. handle_advertise_status ( from , & info . layout_digest ) ;
2021-10-15 09:05:09 +00:00
2024-02-20 10:35:18 +00:00
drop ( local_info ) ;
2021-10-15 09:05:09 +00:00
self . node_status
. write ( )
. unwrap ( )
. insert ( from , ( now_msec ( ) , info . clone ( ) ) ) ;
Ok ( SystemRpc ::Ok )
}
async fn status_exchange_loop ( & self , mut stop_signal : watch ::Receiver < bool > ) {
while ! * stop_signal . borrow ( ) {
2023-09-12 12:35:48 +00:00
let restart_at = Instant ::now ( ) + STATUS_EXCHANGE_INTERVAL ;
2021-10-15 09:05:09 +00:00
2024-02-20 11:37:55 +00:00
// Update local node status that is exchanged.
2021-10-15 09:05:09 +00:00
self . update_local_status ( ) ;
2024-02-20 11:37:55 +00:00
2024-02-20 10:35:18 +00:00
let local_status : NodeStatus = self . local_status . read ( ) . unwrap ( ) . clone ( ) ;
2022-07-22 13:20:00 +00:00
let _ = self
2023-11-09 11:55:36 +00:00
. rpc_helper ( )
2021-10-15 09:05:09 +00:00
. broadcast (
& self . system_endpoint ,
SystemRpc ::AdvertiseStatus ( local_status ) ,
2023-09-12 12:35:48 +00:00
RequestStrategy ::with_priority ( PRIO_HIGH )
. with_custom_timeout ( STATUS_EXCHANGE_INTERVAL ) ,
2021-10-15 09:05:09 +00:00
)
. await ;
select! {
2023-09-12 12:35:48 +00:00
_ = tokio ::time ::sleep_until ( restart_at . into ( ) ) = > { } ,
_ = stop_signal . changed ( ) = > { } ,
2021-10-15 09:05:09 +00:00
}
}
}
async fn discovery_loop ( self : & Arc < Self > , mut stop_signal : watch ::Receiver < bool > ) {
2021-10-14 09:50:12 +00:00
while ! * stop_signal . borrow ( ) {
2024-02-16 09:50:41 +00:00
let n_connected = self
2024-02-14 09:04:46 +00:00
. peering
2021-10-14 09:50:12 +00:00
. get_peer_list ( )
. iter ( )
. filter ( | p | p . is_up ( ) )
2024-02-16 09:50:41 +00:00
. count ( ) ;
2024-03-27 12:37:20 +00:00
let not_configured = ! self . cluster_layout ( ) . is_check_ok ( ) ;
2024-03-04 18:58:32 +00:00
let no_peers = n_connected < self . replication_factor . into ( ) ;
2024-02-20 16:02:44 +00:00
let expected_n_nodes = self . cluster_layout ( ) . all_nodes ( ) . len ( ) ;
2024-02-16 09:50:41 +00:00
let bad_peers = n_connected ! = expected_n_nodes ;
2021-10-14 09:50:12 +00:00
if not_configured | | no_peers | | bad_peers {
info! ( " Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {}) " , not_configured , no_peers , bad_peers ) ;
2022-09-14 14:09:38 +00:00
let mut ping_list = resolve_peers ( & self . bootstrap_peers ) . await ;
2021-10-14 09:50:12 +00:00
2021-10-15 09:05:09 +00:00
// Add peer list from list stored on disk
if let Ok ( peers ) = self . persist_peer_list . load_async ( ) . await {
2023-01-03 13:44:47 +00:00
ping_list . extend ( peers . 0. iter ( ) . map ( | ( id , addr ) | ( ( * id ) . into ( ) , * addr ) ) )
2021-10-14 09:50:12 +00:00
}
2021-10-15 09:05:09 +00:00
// Fetch peer list from Consul
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2022-03-16 11:09:50 +00:00
if let Some ( c ) = & self . consul_discovery {
2022-10-18 17:11:16 +00:00
match c . get_consul_nodes ( ) . await {
2021-10-14 09:50:12 +00:00
Ok ( node_list ) = > {
2021-10-15 09:05:09 +00:00
ping_list . extend ( node_list ) ;
2021-10-14 09:50:12 +00:00
}
Err ( e ) = > {
warn! ( " Could not retrieve node list from Consul: {} " , e ) ;
}
}
}
2022-03-06 13:50:00 +00:00
// Fetch peer list from Kubernetes
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
if let Some ( k ) = & self . kubernetes_discovery {
if ! k . skip_crd {
2022-03-06 13:50:00 +00:00
match create_kubernetes_crd ( ) . await {
Ok ( ( ) ) = > ( ) ,
Err ( e ) = > {
error! ( " Failed to create kubernetes custom resource: {} " , e )
}
} ;
}
2022-10-18 16:38:20 +00:00
match get_kubernetes_nodes ( k ) . await {
2022-03-06 13:50:00 +00:00
Ok ( node_list ) = > {
ping_list . extend ( node_list ) ;
}
Err ( e ) = > {
warn! ( " Could not retrieve node list from Kubernetes: {} " , e ) ;
}
}
}
2024-02-16 09:50:41 +00:00
if ! not_configured & & ! no_peers {
// If the layout is configured, and we already have some connections
// to other nodes in the cluster, we can skip trying to connect to
// nodes that are not in the cluster layout.
2024-02-20 16:02:44 +00:00
let layout = self . cluster_layout ( ) ;
ping_list . retain ( | ( id , _ ) | layout . all_nodes ( ) . contains ( & ( * id ) . into ( ) ) ) ;
2024-02-16 09:50:41 +00:00
}
2021-10-14 09:50:12 +00:00
for ( node_id , node_addr ) in ping_list {
2022-12-14 15:08:05 +00:00
let self2 = self . clone ( ) ;
tokio ::spawn ( async move {
if let Err ( e ) = self2 . netapp . clone ( ) . try_connect ( node_addr , node_id ) . await {
2022-12-14 15:11:19 +00:00
error! ( " {} " , connect_error_message ( node_addr , node_id , e ) ) ;
2022-12-14 15:08:05 +00:00
}
} ) ;
2021-10-14 09:50:12 +00:00
}
}
2021-11-03 16:34:44 +00:00
if let Err ( e ) = self . save_peer_list ( ) . await {
2021-10-15 09:05:09 +00:00
warn! ( " Could not save peer list to file: {} " , e ) ;
}
2022-10-18 16:38:20 +00:00
#[ cfg(feature = " consul-discovery " ) ]
2023-01-03 15:55:59 +00:00
tokio ::spawn ( self . clone ( ) . advertise_to_consul ( ) ) ;
2022-03-16 11:09:50 +00:00
#[ cfg(feature = " kubernetes-discovery " ) ]
2023-01-03 15:55:59 +00:00
tokio ::spawn ( self . clone ( ) . advertise_to_kubernetes ( ) ) ;
2021-10-15 09:05:09 +00:00
2021-10-14 09:50:12 +00:00
select! {
2023-09-12 12:35:48 +00:00
_ = tokio ::time ::sleep ( DISCOVERY_INTERVAL ) = > { } ,
_ = stop_signal . changed ( ) = > { } ,
2021-10-14 09:50:12 +00:00
}
}
}
2021-11-03 16:34:44 +00:00
async fn save_peer_list ( & self ) -> Result < ( ) , Error > {
// Prepare new peer list to save to file
// It is a vec of tuples (node ID as Uuid, node SocketAddr)
let mut peer_list = self
2024-02-14 09:04:46 +00:00
. peering
2021-11-03 16:34:44 +00:00
. get_peer_list ( )
. iter ( )
2024-03-21 09:45:34 +00:00
. filter_map ( | n | match n . state {
PeerConnState ::Connected { addr } = > Some ( ( n . id . into ( ) , addr ) ) ,
_ = > None ,
} )
2021-11-03 16:34:44 +00:00
. collect ::< Vec < _ > > ( ) ;
// Before doing it, we read the current peer list file (if it exists)
// and append it to the list we are about to save,
// so that no peer ID gets lost in the process.
if let Ok ( mut prev_peer_list ) = self . persist_peer_list . load_async ( ) . await {
2023-01-03 13:44:47 +00:00
prev_peer_list
. 0
. retain ( | ( id , _ip ) | peer_list . iter ( ) . all ( | ( id2 , _ip2 ) | id2 ! = id ) ) ;
peer_list . extend ( prev_peer_list . 0 ) ;
2021-11-03 16:34:44 +00:00
}
// Save new peer list to file
2023-01-03 13:44:47 +00:00
self . persist_peer_list
. save_async ( & PeerList ( peer_list ) )
. await
2021-11-03 16:34:44 +00:00
}
2021-10-14 09:50:12 +00:00
}
#[ async_trait ]
impl EndpointHandler < SystemRpc > for System {
2021-10-15 09:05:09 +00:00
async fn handle ( self : & Arc < Self > , msg : & SystemRpc , from : NodeID ) -> Result < SystemRpc , Error > {
match msg {
2023-11-09 11:55:36 +00:00
// ---- system functions -> System ----
2021-10-15 09:05:09 +00:00
SystemRpc ::Connect ( node ) = > self . handle_connect ( node ) . await ,
SystemRpc ::AdvertiseStatus ( adv ) = > self . handle_advertise_status ( from . into ( ) , adv ) . await ,
2023-11-09 11:55:36 +00:00
SystemRpc ::GetKnownNodes = > Ok ( self . handle_get_known_nodes ( ) ) ,
// ---- layout functions -> LayoutManager ----
SystemRpc ::PullClusterLayout = > Ok ( self . layout_manager . handle_pull_cluster_layout ( ) ) ,
2021-11-09 11:24:04 +00:00
SystemRpc ::AdvertiseClusterLayout ( adv ) = > {
2023-11-09 11:55:36 +00:00
self . layout_manager
. handle_advertise_cluster_layout ( adv )
. await
2021-11-09 11:24:04 +00:00
}
2023-11-09 13:53:34 +00:00
SystemRpc ::PullClusterLayoutTrackers = > {
Ok ( self . layout_manager . handle_pull_cluster_layout_trackers ( ) )
}
SystemRpc ::AdvertiseClusterLayoutTrackers ( adv ) = > {
self . layout_manager
. handle_advertise_cluster_layout_trackers ( adv )
. await
}
2023-11-09 11:55:36 +00:00
// ---- other -> Error ----
2022-01-03 12:58:05 +00:00
m = > Err ( Error ::unexpected_rpc_message ( m ) ) ,
2021-10-14 09:50:12 +00:00
}
}
}
2022-03-16 11:09:50 +00:00
2023-01-26 14:04:32 +00:00
impl NodeStatus {
2024-03-04 18:58:32 +00:00
fn initial ( replication_factor : ReplicationFactor , layout_manager : & LayoutManager ) -> Self {
2023-01-26 14:04:32 +00:00
NodeStatus {
2023-11-28 13:25:04 +00:00
hostname : Some (
gethostname ::gethostname ( )
. into_string ( )
. unwrap_or_else ( | _ | " <invalid utf-8> " . to_string ( ) ) ,
) ,
2024-03-04 18:58:32 +00:00
replication_factor : replication_factor . into ( ) ,
2023-11-16 12:51:40 +00:00
layout_digest : layout_manager . layout ( ) . digest ( ) ,
2023-01-26 14:04:32 +00:00
meta_disk_avail : None ,
data_disk_avail : None ,
}
}
fn unknown ( ) -> Self {
NodeStatus {
2023-11-28 13:25:04 +00:00
hostname : None ,
2023-01-26 14:04:32 +00:00
replication_factor : 0 ,
2023-11-16 12:51:40 +00:00
layout_digest : Default ::default ( ) ,
2023-01-26 14:04:32 +00:00
meta_disk_avail : None ,
data_disk_avail : None ,
}
}
2024-02-20 10:35:18 +00:00
fn update_disk_usage ( & mut self , meta_dir : & Path , data_dir : & DataDirEnum ) {
2023-09-11 17:08:24 +00:00
use nix ::sys ::statvfs ::statvfs ;
2025-01-04 11:50:10 +00:00
// The HashMap used below requires a filesystem identifier from statfs (instead of statvfs) on FreeBSD, as
// FreeBSD's statvfs filesystem identifier is "not meaningful in this implementation" (man 3 statvfs).
#[ cfg(target_os = " freebsd " ) ]
let get_filesystem_id = | path : & Path | match nix ::sys ::statfs ::statfs ( path ) {
Ok ( fs ) = > Some ( fs . filesystem_id ( ) ) ,
Err ( _ ) = > None ,
} ;
2023-09-11 17:08:24 +00:00
let mount_avail = | path : & Path | match statvfs ( path ) {
Ok ( x ) = > {
2023-10-15 15:57:27 +00:00
let avail = x . blocks_available ( ) as u64 * x . fragment_size ( ) as u64 ;
let total = x . blocks ( ) as u64 * x . fragment_size ( ) as u64 ;
2023-09-11 18:00:02 +00:00
Some ( ( x . filesystem_id ( ) , avail , total ) )
2023-09-11 17:08:24 +00:00
}
Err ( _ ) = > None ,
2023-01-26 14:04:32 +00:00
} ;
2023-09-11 18:00:02 +00:00
self . meta_disk_avail = mount_avail ( meta_dir ) . map ( | ( _ , a , t ) | ( a , t ) ) ;
2025-01-04 11:50:10 +00:00
2023-09-04 12:49:49 +00:00
self . data_disk_avail = match data_dir {
2023-09-11 18:00:02 +00:00
DataDirEnum ::Single ( dir ) = > mount_avail ( dir ) . map ( | ( _ , a , t ) | ( a , t ) ) ,
DataDirEnum ::Multiple ( dirs ) = > ( | | {
// TODO: more precise calculation that takes into account
// how data is going to be spread among partitions
let mut mounts = HashMap ::new ( ) ;
for dir in dirs . iter ( ) {
if dir . capacity . is_none ( ) {
continue ;
}
2025-01-04 11:50:10 +00:00
#[ cfg(not(target_os = " freebsd " )) ]
2023-09-11 18:00:02 +00:00
match mount_avail ( & dir . path ) {
Some ( ( fsid , avail , total ) ) = > {
mounts . insert ( fsid , ( avail , total ) ) ;
2025-01-04 13:46:42 +00:00
} ,
2023-09-11 18:00:02 +00:00
None = > return None ,
}
2025-01-04 11:50:10 +00:00
#[ cfg(target_os = " freebsd " ) ]
match get_filesystem_id ( & dir . path ) {
Some ( fsid ) = > match mount_avail ( & dir . path ) {
Some ( ( _ , avail , total ) ) = > {
mounts . insert ( fsid , ( avail , total ) ) ;
2025-01-04 13:46:42 +00:00
} ,
2025-01-04 11:50:10 +00:00
None = > return None ,
2025-01-04 13:46:42 +00:00
} ,
2025-01-04 11:50:10 +00:00
None = > return None ,
}
2023-09-07 12:42:20 +00:00
}
2023-09-11 18:00:02 +00:00
Some (
mounts
. into_iter ( )
. fold ( ( 0 , 0 ) , | ( x , y ) , ( _ , ( a , b ) ) | ( x + a , y + b ) ) ,
)
} ) ( ) ,
2023-09-04 12:49:49 +00:00
} ;
2023-01-26 14:04:32 +00:00
}
}
2024-04-23 08:57:43 +00:00
/// Obtain the list of currently available IP addresses on all non-loopback
/// interfaces, optionally filtering them to be inside a given IpNet.
fn get_default_ip ( filter_ipnet : Option < ipnet ::IpNet > ) -> Option < IpAddr > {
2022-03-16 11:09:50 +00:00
pnet_datalink ::interfaces ( )
2024-04-23 08:57:43 +00:00
. into_iter ( )
// filter down and loopback interfaces
. filter ( | i | i . is_up ( ) & & ! i . is_loopback ( ) )
// get all IPs
. flat_map ( | e | e . ips )
// optionally, filter to be inside filter_ipnet
. find ( | ipn | {
filter_ipnet . is_some_and ( | ipnet | ipnet . contains ( & ipn . ip ( ) ) ) | | filter_ipnet . is_none ( )
} )
. map ( | ipn | ipn . ip ( ) )
2022-03-16 11:09:50 +00:00
}
2023-11-09 11:55:36 +00:00
fn get_rpc_public_addr ( config : & Config ) -> Option < SocketAddr > {
match & config . rpc_public_addr {
Some ( a_str ) = > {
use std ::net ::ToSocketAddrs ;
match a_str . to_socket_addrs ( ) {
Err ( e ) = > {
error! (
" Cannot resolve rpc_public_addr {} from config file: {}. " ,
a_str , e
) ;
None
}
Ok ( a ) = > {
let a = a . collect ::< Vec < _ > > ( ) ;
if a . is_empty ( ) {
error! ( " rpc_public_addr {} resolve to no known IP address " , a_str ) ;
}
if a . len ( ) > 1 {
warn! ( " Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one. " , a ) ;
}
a . into_iter ( ) . next ( )
}
}
}
None = > {
2024-04-23 08:57:43 +00:00
// `No rpc_public_addr` specified, try to discover one, optionally filtering by `rpc_public_addr_subnet`.
let filter_subnet : Option < ipnet ::IpNet > = config
. rpc_public_addr_subnet
. as_ref ( )
. and_then ( | filter_subnet_str | match filter_subnet_str . parse ::< ipnet ::IpNet > ( ) {
Ok ( filter_subnet ) = > {
let filter_subnet_trunc = filter_subnet . trunc ( ) ;
if filter_subnet_trunc ! = filter_subnet {
warn! ( " `rpc_public_addr_subnet` changed after applying netmask, continuing with {} " , filter_subnet . trunc ( ) ) ;
}
Some ( filter_subnet_trunc )
}
Err ( e ) = > {
panic! (
" Cannot parse rpc_public_addr_subnet {} from config file: {}. Bailing out. " ,
filter_subnet_str , e
) ;
}
} ) ;
let addr = get_default_ip ( filter_subnet )
. map ( | ip | SocketAddr ::new ( ip , config . rpc_bind_addr . port ( ) ) ) ;
2023-11-09 11:55:36 +00:00
if let Some ( a ) = addr {
warn! ( " Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible. " , a ) ;
}
addr
}
}
}
2022-09-14 14:09:38 +00:00
async fn resolve_peers ( peers : & [ String ] ) -> Vec < ( NodeID , SocketAddr ) > {
let mut ret = vec! [ ] ;
for peer in peers . iter ( ) {
match parse_and_resolve_peer_addr_async ( peer ) . await {
Some ( ( pubkey , addrs ) ) = > {
for ip in addrs {
ret . push ( ( pubkey , ip ) ) ;
}
}
None = > {
warn! ( " Unable to parse and/or resolve peer hostname {} " , peer ) ;
}
}
}
ret
}
2022-12-14 11:57:33 +00:00
2022-12-14 15:11:19 +00:00
fn connect_error_message (
addr : SocketAddr ,
pubkey : ed25519 ::PublicKey ,
2024-02-13 11:55:41 +00:00
e : garage_net ::error ::Error ,
2022-12-14 15:11:19 +00:00
) -> String {
format! ( " Error establishing RPC connection to remote node: {} @ {} . \n This can happen if the remote node is not reachable on the network, but also if the two nodes are not configured with the same rpc_secret. \n {} " , hex ::encode ( pubkey ) , addr , e )
2022-12-14 11:57:33 +00:00
}