Take into account unhealthy nodes

This commit is contained in:
Alex 2023-08-27 15:58:47 +02:00
parent b5e8d1fcd8
commit a3602eac82
3 changed files with 85 additions and 158 deletions

View file

@ -233,7 +233,8 @@ async fn select_target_and_proxy(
.entries .entries
.iter() .iter()
.filter(|ent| { .filter(|ent| {
ent.host.matches(host) ent.flags.healthy
&& ent.host.matches(host)
&& ent && ent
.path_prefix .path_prefix
.as_ref() .as_ref()

View file

@ -245,7 +245,13 @@ async fn dump_config_on_change(
for ((host, prefix), ents) in cfg_map.iter_mut() { for ((host, prefix), ents) in cfg_map.iter_mut() {
println!("{}{}:", host, prefix.as_deref().unwrap_or_default()); println!("{}{}:", host, prefix.as_deref().unwrap_or_default());
for ent in ents.iter() { for ent in ents.iter() {
println!(" {}", ent); print!(" ");
if !ent.flags.healthy {
print!("/!\\ ");
} else {
print!(" ");
}
println!("{}", ent);
} }
} }
println!(); println!();

View file

@ -1,16 +1,13 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::sync::{atomic, Arc}; use std::sync::{atomic, Arc};
use std::{cmp, time::Duration}; use std::time::Duration;
use anyhow::Result; use anyhow::Result;
use opentelemetry::{metrics, KeyValue}; use opentelemetry::{metrics, KeyValue};
use futures::future::BoxFuture;
use futures::stream::{FuturesUnordered, StreamExt};
use log::*; use log::*;
use tokio::{select, sync::watch, time::sleep}; use tokio::{select, sync::watch};
use crate::consul; use crate::consul;
@ -93,6 +90,9 @@ impl Eq for ProxyEntry {}
#[derive(Debug, Clone, Copy, Eq, PartialEq)] #[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct ProxyEntryFlags { pub struct ProxyEntryFlags {
/// Is the target healthy?
pub healthy: bool,
/// Is the target the same node as we are running on? /// Is the target the same node as we are running on?
/// (if yes priorize it over other matching targets) /// (if yes priorize it over other matching targets)
pub same_node: bool, pub same_node: bool,
@ -119,6 +119,9 @@ impl std::fmt::Display for ProxyEntry {
self.path_prefix.as_deref().unwrap_or_default(), self.path_prefix.as_deref().unwrap_or_default(),
self.priority self.priority
)?; )?;
if !self.flags.healthy {
write!(f, " UNHEALTHY")?;
}
if self.flags.same_node { if self.flags.same_node {
write!(f, " OURSELF")?; write!(f, " OURSELF")?;
} else if self.flags.same_site { } else if self.flags.same_site {
@ -141,16 +144,6 @@ pub struct ProxyConfig {
pub entries: Vec<ProxyEntry>, pub entries: Vec<ProxyEntry>,
} }
fn retry_to_time(retries: u32, max_time: Duration) -> Duration {
// 1.2^x seems to be a good value to exponentially increase time at a good pace
// eg. 1.2^32 = 341 seconds ~= 5 minutes - ie. after 32 retries we wait 5
// minutes
Duration::from_secs(cmp::min(
max_time.as_secs(),
1.2f64.powf(retries as f64) as u64,
))
}
fn parse_tricot_tag( fn parse_tricot_tag(
service_name: String, service_name: String,
tag: &str, tag: &str,
@ -209,63 +202,55 @@ fn parse_tricot_add_header_tag(tag: &str) -> Option<(String, String)> {
} }
} }
fn parse_consul_catalog( fn parse_consul_service(
catalog: &consul::catalog::CatalogNode, s: &consul::catalog::HealthServiceNode,
same_node: bool, mut flags: ProxyEntryFlags,
same_site: bool,
) -> Vec<ProxyEntry> { ) -> Vec<ProxyEntry> {
trace!("Parsing node catalog: {:#?}", catalog); trace!("Parsing service: {:#?}", s);
let mut entries = vec![]; let mut entries = vec![];
for (_, svc) in catalog.services.iter() { let ip_addr = match s.service.address.parse() {
let ip_addr = match svc.address.parse() {
Ok(ip) => ip, Ok(ip) => ip,
_ => match catalog.node.address.parse() { _ => match s.node.address.parse() {
Ok(ip) => ip, Ok(ip) => ip,
_ => { _ => {
warn!( warn!(
"Could not get address for service {} at node {}", "Could not get address for service {} at node {}",
svc.service, catalog.node.node s.service.service, s.node.node
); );
continue; return vec![];
} }
}, },
}; };
let addr = SocketAddr::new(ip_addr, svc.port); let addr = SocketAddr::new(ip_addr, s.service.port);
let (site_lb, global_lb) = if svc.tags.contains(&"tricot-global-lb".into()) { if s.service.tags.contains(&"tricot-global-lb".into()) {
(false, true) flags.global_lb = true;
} else if svc.tags.contains(&"tricot-site-lb".into()) { } else if s.service.tags.contains(&"tricot-site-lb".into()) {
(true, false) flags.site_lb = true;
} else {
(false, false)
};
let flags = ProxyEntryFlags {
same_node,
same_site,
site_lb,
global_lb,
}; };
let mut add_headers = vec![]; let mut add_headers = vec![];
for tag in svc.tags.iter() { for tag in s.service.tags.iter() {
if let Some(pair) = parse_tricot_add_header_tag(tag) { if let Some(pair) = parse_tricot_add_header_tag(tag) {
add_headers.push(pair); add_headers.push(pair);
} }
} }
for tag in svc.tags.iter() { for tag in s.service.tags.iter() {
if let Some(ent) = if let Some(ent) = parse_tricot_tag(
parse_tricot_tag(svc.service.clone(), tag, addr, &add_headers[..], flags) s.service.service.clone(),
{ tag,
addr,
&add_headers[..],
flags,
) {
entries.push(ent); entries.push(ent);
} }
} }
}
trace!("Result of parsing catalog:"); trace!("Result of parsing service:");
for ent in entries.iter() { for ent in entries.iter() {
trace!(" {}", ent); trace!(" {}", ent);
} }
@ -273,13 +258,6 @@ fn parse_consul_catalog(
entries entries
} }
#[derive(Default)]
struct NodeWatchState {
last_idx: Option<usize>,
last_catalog: Option<consul::catalog::CatalogNode>,
retries: u32,
}
pub fn spawn_proxy_config_task( pub fn spawn_proxy_config_task(
consul: consul::Consul, consul: consul::Consul,
local_node: String, local_node: String,
@ -293,108 +271,50 @@ pub fn spawn_proxy_config_task(
let consul = Arc::new(consul); let consul = Arc::new(consul);
tokio::spawn(async move { tokio::spawn(async move {
let mut nodes = HashMap::new(); let mut catalog_rx = consul.watch_all_service_health(Duration::from_secs(300));
let mut watches = FuturesUnordered::<BoxFuture<'static, (String, Result<_>)>>::new(); let mut local_node_site = None;
let mut node_site = HashMap::new();
while !*must_exit.borrow() { while !*must_exit.borrow() {
let list_nodes = select! { select! {
ln = consul.catalog_node_list(None) => ln, _ = catalog_rx.changed() => (),
_ = must_exit.changed() => continue, _ = must_exit.changed() => continue,
}; };
match list_nodes { let services = catalog_rx.borrow_and_update().clone();
Ok(consul_nodes) => { if local_node_site.is_none() {
info!("Watched consul nodes: {:?}", consul_nodes); for (_, svcnodes) in services.iter() {
for consul_node in consul_nodes.into_inner() { for svcnode in svcnodes.iter() {
let node = &consul_node.node; if svcnode.node.node == local_node {
if !nodes.contains_key(node) { if let Some(site) = svcnode.node.meta.get("site") {
nodes.insert(node.clone(), NodeWatchState::default()); local_node_site = Some(site.to_string());
let node = node.to_string();
let consul = consul.clone();
watches.push(Box::pin(async move {
let res = consul.catalog_node(&node, None).await;
(node, res)
}));
}
if let Some(site) = consul_node.meta.get("site") {
node_site.insert(node.clone(), site.clone());
} }
} }
} }
Err(e) => {
warn!("Could not get Consul node list: {}", e);
}
}
let next_watch = select! {
nw = watches.next() => nw,
_ = must_exit.changed() => continue,
};
let (node, res): (String, Result<_>) = match next_watch {
Some(v) => v,
None => {
warn!("No nodes currently watched in proxy_config.rs");
sleep(Duration::from_secs(10)).await;
continue;
}
};
match res {
Ok(res) => {
let new_idx = res.index();
let catalog = res.into_inner();
let mut watch_state = nodes.get_mut(&node).unwrap();
watch_state.last_idx = Some(new_idx);
watch_state.last_catalog = catalog;
watch_state.retries = 0;
let idx = watch_state.last_idx;
let consul = consul.clone();
watches.push(Box::pin(async move {
let res = consul.catalog_node(&node, idx).await;
(node, res)
}));
}
Err(e) => {
let mut watch_state = nodes.get_mut(&node).unwrap();
watch_state.retries += 1;
watch_state.last_idx = None;
let will_retry_in =
retry_to_time(watch_state.retries, Duration::from_secs(600));
error!(
"Failed to query consul for node {}. Will retry in {}s. {}",
node,
will_retry_in.as_secs(),
e
);
let consul = consul.clone();
watches.push(Box::pin(async move {
sleep(will_retry_in).await;
let res = consul.catalog_node(&node, None).await;
(node, res)
}));
continue;
} }
} }
let mut entries = vec![]; let mut entries = vec![];
for (node_name, watch_state) in nodes.iter() {
if let Some(catalog) = &watch_state.last_catalog { for (_service, svcnodes) in services.iter() {
let same_node = *node_name == local_node; for svcnode in svcnodes.iter() {
let same_site = match (node_site.get(node_name), node_site.get(&local_node)) { let healthy = !svcnode.checks.iter().any(|x| x.status == "critical");
let same_node = svcnode.node.node == local_node;
let same_site = match (svcnode.node.meta.get("site"), local_node_site.as_ref())
{
(Some(s1), Some(s2)) => s1 == s2, (Some(s1), Some(s2)) => s1 == s2,
_ => false, _ => false,
}; };
entries.extend(parse_consul_catalog(catalog, same_node, same_site)); let flags = ProxyEntryFlags {
healthy,
same_node,
same_site,
site_lb: false,
global_lb: false,
};
entries.extend(parse_consul_service(&svcnode, flags));
} }
} }