Garage v1.0 #683

Merged
lx merged 119 commits from next-0.10 into main 2024-04-10 15:23:13 +00:00
4 changed files with 109 additions and 33 deletions
Showing only changes of commit 03ebf18830 - Show all commits

View file

@ -1,3 +1,5 @@
use std::collections::HashSet;
use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::crdt::{Crdt, Lww, LwwMap};
use garage_util::data::*; use garage_util::data::*;
use garage_util::encode::nonversioned_encode; use garage_util::encode::nonversioned_encode;
@ -30,6 +32,14 @@ impl LayoutHistory {
self.versions.last().as_ref().unwrap() self.versions.last().as_ref().unwrap()
} }
pub fn all_storage_nodes(&self) -> HashSet<Uuid> {
self.versions
.iter()
.map(|x| x.nongateway_nodes())
.flatten()
.collect::<HashSet<_>>()
}
pub(crate) fn update_hashes(&mut self) { pub(crate) fn update_hashes(&mut self) {
self.trackers_hash = self.calculate_trackers_hash(); self.trackers_hash = self.calculate_trackers_hash();
self.staging_hash = self.calculate_staging_hash(); self.staging_hash = self.calculate_staging_hash();
@ -43,6 +53,65 @@ impl LayoutHistory {
blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) blake2sum(&nonversioned_encode(&self.staging).unwrap()[..])
} }
// ------------------ update tracking ---------------
pub(crate) fn update_trackers(&mut self, node_id: Uuid) {
// Ensure trackers for this node's values are up-to-date
// 1. Acknowledge the last layout version in the history
self.ack_last(node_id);
// 2. Assume the data on this node is sync'ed up at least to
// the first layout version in the history
self.sync_first(node_id);
// 3. Acknowledge everyone has synced up to min(self.sync_map)
self.sync_ack(node_id);
// 4. Cleanup layout versions that are not needed anymore
self.cleanup_old_versions();
info!("ack_map: {:?}", self.update_trackers.ack_map);
info!("sync_map: {:?}", self.update_trackers.sync_map);
info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map);
// Finally, update hashes
self.update_hashes();
}
pub(crate) fn ack_last(&mut self, node: Uuid) {
let last_version = self.current().version;
self.update_trackers.ack_map.set_max(node, last_version);
}
pub(crate) fn sync_first(&mut self, node: Uuid) {
let first_version = self.versions.first().as_ref().unwrap().version;
self.update_trackers.sync_map.set_max(node, first_version);
}
pub(crate) fn sync_ack(&mut self, node: Uuid) {
self.update_trackers.sync_ack_map.set_max(
node,
self.calculate_global_min(&self.update_trackers.sync_map),
);
}
pub(crate) fn cleanup_old_versions(&mut self) {
let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map);
while self.versions.first().as_ref().unwrap().version < min_sync_ack {
self.versions.remove(0);
}
}
pub(crate) fn calculate_global_min(&self, tracker: &UpdateTracker) -> u64 {
let storage_nodes = self.all_storage_nodes();
storage_nodes
.iter()
.map(|x| tracker.0.get(x).copied().unwrap_or(0))
.min()
.unwrap_or(0)
}
// ================== updates to layout, public interface =================== // ================== updates to layout, public interface ===================
pub fn merge(&mut self, other: &LayoutHistory) -> bool { pub fn merge(&mut self, other: &LayoutHistory) -> bool {
@ -78,11 +147,6 @@ impl LayoutHistory {
changed = true; changed = true;
} }
// Update hashes if there are changes
if changed {
self.update_hashes();
}
changed changed
} }

View file

@ -51,7 +51,7 @@ impl LayoutManager {
let persist_cluster_layout: Persister<LayoutHistory> = let persist_cluster_layout: Persister<LayoutHistory> =
Persister::new(&config.metadata_dir, "cluster_layout"); Persister::new(&config.metadata_dir, "cluster_layout");
let cluster_layout = match persist_cluster_layout.load() { let mut cluster_layout = match persist_cluster_layout.load() {
Ok(x) => { Ok(x) => {
if x.current().replication_factor != replication_factor { if x.current().replication_factor != replication_factor {
return Err(Error::Message(format!( return Err(Error::Message(format!(
@ -71,6 +71,8 @@ impl LayoutManager {
} }
}; };
cluster_layout.update_trackers(node_id.into());
let layout = Arc::new(RwLock::new(cluster_layout)); let layout = Arc::new(RwLock::new(cluster_layout));
let change_notify = Arc::new(Notify::new()); let change_notify = Arc::new(Notify::new());
@ -126,7 +128,7 @@ impl LayoutManager {
if prev_layout_check && layout.check().is_err() { if prev_layout_check && layout.check().is_err() {
panic!("Merged two correct layouts and got an incorrect layout."); panic!("Merged two correct layouts and got an incorrect layout.");
} }
layout.update_trackers(self.node_id);
return Some(layout.clone()); return Some(layout.clone());
} }
} }
@ -137,6 +139,7 @@ impl LayoutManager {
let mut layout = self.layout.write().unwrap(); let mut layout = self.layout.write().unwrap();
if layout.update_trackers != *adv { if layout.update_trackers != *adv {
if layout.update_trackers.merge(adv) { if layout.update_trackers.merge(adv) {
layout.update_trackers(self.node_id);
return Some(layout.update_trackers.clone()); return Some(layout.update_trackers.clone());
} }
} }

View file

@ -3,6 +3,7 @@ use std::fmt;
use bytesize::ByteSize; use bytesize::ByteSize;
use garage_util::crdt::{AutoCrdt, Crdt}; use garage_util::crdt::{AutoCrdt, Crdt};
use garage_util::data::Uuid;
mod v08 { mod v08 {
use crate::layout::CompactNodeType; use crate::layout::CompactNodeType;
@ -276,8 +277,7 @@ mod v010 {
let update_tracker = UpdateTracker( let update_tracker = UpdateTracker(
version version
.nongateway_nodes() .nongateway_nodes()
.iter() .map(|x| (x, version.version))
.map(|x| (*x, version.version))
.collect::<HashMap<Uuid, u64>>(), .collect::<HashMap<Uuid, u64>>(),
); );
let staging = LayoutStaging { let staging = LayoutStaging {
@ -375,8 +375,15 @@ impl UpdateTracker {
changed changed
} }
pub(crate) fn min(&self) -> u64 { pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) {
self.0.iter().map(|(_, v)| *v).min().unwrap_or(0) match self.0.get_mut(&peer) {
Some(e) => {
*e = std::cmp::max(*e, value);
}
None => {
self.0.insert(peer, value);
}
}
} }
} }

View file

@ -134,15 +134,14 @@ impl LayoutVersion {
// ===================== internal information extractors ====================== // ===================== internal information extractors ======================
/// Returns the uuids of the non_gateway nodes in self.node_id_vec. /// Returns the uuids of the non_gateway nodes in self.node_id_vec.
pub(crate) fn nongateway_nodes(&self) -> Vec<Uuid> { pub(crate) fn nongateway_nodes(&self) -> impl Iterator<Item = Uuid> + '_ {
let mut result = Vec::<Uuid>::new(); self.node_id_vec
for uuid in self.node_id_vec.iter() { .iter()
match self.node_role(uuid) { .copied()
Some(role) if role.capacity.is_some() => result.push(*uuid), .filter(move |uuid| match self.node_role(uuid) {
_ => (), Some(role) if role.capacity.is_some() => true,
} _ => false,
} })
result
} }
/// Given a node uuids, this function returns the label of its zone /// Given a node uuids, this function returns the label of its zone
@ -158,8 +157,8 @@ impl LayoutVersion {
/// Returns the sum of capacities of non gateway nodes in the cluster /// Returns the sum of capacities of non gateway nodes in the cluster
fn get_total_capacity(&self) -> Result<u64, Error> { fn get_total_capacity(&self) -> Result<u64, Error> {
let mut total_capacity = 0; let mut total_capacity = 0;
for uuid in self.nongateway_nodes().iter() { for uuid in self.nongateway_nodes() {
total_capacity += self.get_node_capacity(uuid)?; total_capacity += self.get_node_capacity(&uuid)?;
} }
Ok(total_capacity) Ok(total_capacity)
} }
@ -320,7 +319,7 @@ impl LayoutVersion {
// to use them as indices in the flow graphs. // to use them as indices in the flow graphs.
let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?;
let nb_nongateway_nodes = self.nongateway_nodes().len(); let nb_nongateway_nodes = self.nongateway_nodes().count();
if nb_nongateway_nodes < self.replication_factor { if nb_nongateway_nodes < self.replication_factor {
return Err(Error::Message(format!( return Err(Error::Message(format!(
"The number of nodes with positive \ "The number of nodes with positive \
@ -479,7 +478,8 @@ impl LayoutVersion {
let mut id_to_zone = Vec::<String>::new(); let mut id_to_zone = Vec::<String>::new();
let mut zone_to_id = HashMap::<String, usize>::new(); let mut zone_to_id = HashMap::<String, usize>::new();
for uuid in self.nongateway_nodes().iter() { let nongateway_nodes = self.nongateway_nodes().collect::<Vec<_>>();
for uuid in nongateway_nodes.iter() {
let r = self.node_role(uuid).unwrap(); let r = self.node_role(uuid).unwrap();
if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() {
zone_to_id.insert(r.zone.clone(), id_to_zone.len()); zone_to_id.insert(r.zone.clone(), id_to_zone.len());
@ -556,8 +556,10 @@ impl LayoutVersion {
exclude_assoc: &HashSet<(usize, usize)>, exclude_assoc: &HashSet<(usize, usize)>,
zone_redundancy: usize, zone_redundancy: usize,
) -> Result<Graph<FlowEdge>, Error> { ) -> Result<Graph<FlowEdge>, Error> {
let vertices = let vertices = LayoutVersion::generate_graph_vertices(
LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); zone_to_id.len(),
self.nongateway_nodes().count(),
);
let mut g = Graph::<FlowEdge>::new(&vertices); let mut g = Graph::<FlowEdge>::new(&vertices);
let nb_zones = zone_to_id.len(); let nb_zones = zone_to_id.len();
for p in 0..NB_PARTITIONS { for p in 0..NB_PARTITIONS {
@ -576,7 +578,7 @@ impl LayoutVersion {
)?; )?;
} }
} }
for n in 0..self.nongateway_nodes().len() { for n in 0..self.nongateway_nodes().count() {
let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?;
let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?];
g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?;
@ -600,7 +602,7 @@ impl LayoutVersion {
// previous assignment // previous assignment
let mut exclude_edge = HashSet::<(usize, usize)>::new(); let mut exclude_edge = HashSet::<(usize, usize)>::new();
if let Some(prev_assign) = prev_assign_opt { if let Some(prev_assign) = prev_assign_opt {
let nb_nodes = self.nongateway_nodes().len(); let nb_nodes = self.nongateway_nodes().count();
for (p, prev_assign_p) in prev_assign.iter().enumerate() { for (p, prev_assign_p) in prev_assign.iter().enumerate() {
for n in 0..nb_nodes { for n in 0..nb_nodes {
exclude_edge.insert((p, n)); exclude_edge.insert((p, n));
@ -652,7 +654,7 @@ impl LayoutVersion {
// We compute the maximal length of a simple path in gflow. It is used in the // We compute the maximal length of a simple path in gflow. It is used in the
// Bellman-Ford algorithm in optimize_flow_with_cost to set the number // Bellman-Ford algorithm in optimize_flow_with_cost to set the number
// of iterations. // of iterations.
let nb_nodes = self.nongateway_nodes().len(); let nb_nodes = self.nongateway_nodes().count();
let path_length = 4 * nb_nodes; let path_length = 4 * nb_nodes;
gflow.optimize_flow_with_cost(&cost, path_length)?; gflow.optimize_flow_with_cost(&cost, path_length)?;
@ -730,7 +732,7 @@ impl LayoutVersion {
} }
// We define and fill in the following tables // We define and fill in the following tables
let storing_nodes = self.nongateway_nodes(); let storing_nodes = self.nongateway_nodes().collect::<Vec<_>>();
let mut new_partitions = vec![0; storing_nodes.len()]; let mut new_partitions = vec![0; storing_nodes.len()];
let mut stored_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()];
@ -873,9 +875,9 @@ mod tests {
for z in zones.iter() { for z in zones.iter() {
zone_token.insert(z.clone(), 0); zone_token.insert(z.clone(), 0);
} }
for uuid in cl.nongateway_nodes().iter() { for uuid in cl.nongateway_nodes() {
let z = cl.get_node_zone(uuid)?; let z = cl.get_node_zone(&uuid)?;
let c = cl.get_node_capacity(uuid)?; let c = cl.get_node_capacity(&uuid)?;
zone_token.insert( zone_token.insert(
z.clone(), z.clone(),
zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize),