From 015ccb39aa511c72d0c899713a828491871da3e7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 11:57:36 +0200 Subject: new layout: make zone_redundancy optionnal (if not set, is maximum) --- src/rpc/layout.rs | 131 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 35 deletions(-) (limited to 'src/rpc') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 76d29b08..9aa9c584 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,6 +1,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; +use std::fmt; use bytesize::ByteSize; use itertools::Itertools; @@ -115,7 +116,16 @@ mod v09 { /// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] pub struct LayoutParameters { - pub zone_redundancy: usize, + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// If None, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, } impl garage_util::migrate::Migrate for ClusterLayout { @@ -125,7 +135,6 @@ mod v09 { fn migrate(previous: Self::Previous) -> Self { use itertools::Itertools; - use std::collections::HashSet; // In the old layout, capacities are in an arbitrary unit, // but in the new layout they are in bytes. @@ -152,17 +161,10 @@ mod v09 { .min() .unwrap_or(0); - // Determine zone redundancy parameter - let zone_redundancy = std::cmp::min( - previous.replication_factor, - roles - .items() - .iter() - .filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str())) - .collect::>() - .len(), - ); - let parameters = LayoutParameters { zone_redundancy }; + // By default, zone_redundancy is None (i.e. maximum possible value) + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; let mut res = Self { version: previous.version, @@ -224,13 +226,37 @@ impl NodeRole { } } +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + // Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be equal to the replication factor, - // i.e. as strict as possible. + // We set the default zone redundancy to be None, meaning that the maximum + // possible value will be used depending on the cluster topology let parameters = LayoutParameters { - zone_redundancy: replication_factor, + zone_redundancy: ZoneRedundancy::Maximum, }; let staging_parameters = Lww::::new(parameters.clone()); @@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show` Ok(total_capacity) } + /// Returns the effective value of the zone_redundancy parameter + fn effective_zone_redundancy(&self) -> usize { + match self.parameters.zone_redundancy { + ZoneRedundancy::AtLeast(v) => v, + ZoneRedundancy::Maximum => { + let n_zones = self + .roles + .items() + .iter() + .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) + .collect::>() + .len(); + std::cmp::min(n_zones, self.replication_factor) + } + } + } + /// Check a cluster layout for internal consistency /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error @@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } // Check that every partition is associated to distinct nodes + let zone_redundancy = self.effective_zone_redundancy(); let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); @@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show` .expect("Zone not found.") }) .collect::>(); - let redundancy = self.parameters.zone_redundancy; - if zones_of_p.iter().unique().count() < redundancy { + if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( "nodes of partition are in less than {} distinct zones", - redundancy + zone_redundancy )); } } @@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); - match cl2.compute_optimal_partition_size(&zone_to_id) { + match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { Ok(s) if s != self.partition_size => { return Err(format!( "partition_size ({}) is different than optimal value ({})", @@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } } +// ==================================================================================== + // Implementation of the ClusterLayout methods related to the assignment algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignment. @@ -549,13 +594,15 @@ impl ClusterLayout { // changes in the layout. We retrieve the old_assignment reframed with new ids let old_assignment_opt = self.update_node_id_vec()?; + let zone_redundancy = self.effective_zone_redundancy(); + let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("".into()); msg.push(format!( "Partitions are \ replicated {} times on at least {} distinct zones.", - self.replication_factor, self.parameters.zone_redundancy + self.replication_factor, zone_redundancy )); // We generate for once numerical ids for the zones of non gateway nodes, @@ -570,12 +617,12 @@ impl ClusterLayout { nb_nongateway_nodes, self.replication_factor ))); } - if id_to_zone.len() < self.parameters.zone_redundancy { + if id_to_zone.len() < zone_redundancy { return Err(Error::Message(format!( "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", id_to_zone.len(), - self.parameters.zone_redundancy + zone_redundancy ))); } @@ -583,7 +630,7 @@ impl ClusterLayout { // Capacities should be given in a unit so that partition size is at least 100. // In this case, integer rounding plays a marginal role in the percentages of // optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; msg.push("".into()); if old_assignment_opt != None { @@ -610,7 +657,8 @@ impl ClusterLayout { // We compute a first flow/assignment that is heuristically close to the previous // assignment - let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?; + let mut gflow = + self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; if let Some(assoc) = &old_assignment_opt { // We minimize the distance to the previous assignment. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; @@ -735,9 +783,10 @@ impl ClusterLayout { fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, + zone_redundancy: usize, ) -> Result { let empty_set = HashSet::<(usize, usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; g.compute_maximal_flow()?; if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { return Err(Error::Message( @@ -750,7 +799,12 @@ impl ClusterLayout { let mut s_down = 1; let mut s_up = self.get_total_capacity()?; while s_down + 1 < s_up { - g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g = self.generate_flow_graph( + (s_down + s_up) / 2, + zone_to_id, + &empty_set, + zone_redundancy, + )?; g.compute_maximal_flow()?; if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { s_up = (s_down + s_up) / 2; @@ -789,18 +843,18 @@ impl ClusterLayout { partition_size: u64, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, + zone_redundancy: usize, ) -> Result, Error> { let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; g.add_edge( Vertex::Source, Vertex::Pdown(p), - (self.replication_factor - redundancy) as u64, + (self.replication_factor - zone_redundancy) as u64, )?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; @@ -829,6 +883,7 @@ impl ClusterLayout { &self, zone_to_id: &HashMap, prev_assign_opt: &Option>>, + zone_redundancy: usize, ) -> Result, Error> { // We list the (partition,node) associations that are not used in the // previous assignment @@ -846,7 +901,12 @@ impl ClusterLayout { } // We compute the best flow using only the edges used in the previous assignment - let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + let mut g = self.generate_flow_graph( + self.partition_size, + zone_to_id, + &exclude_edge, + zone_redundancy, + )?; g.compute_maximal_flow()?; // We add the excluded edges and compute the maximal flow with the full graph. @@ -997,7 +1057,7 @@ impl ClusterLayout { if *prev_assign_opt == None { new_partitions = stored_partitions.clone(); - new_partitions_zone = stored_partitions_zone.clone(); + //new_partitions_zone = stored_partitions_zone.clone(); } // We display the statistics @@ -1124,7 +1184,7 @@ mod tests { let mut curr_zone = 0; - let redundancy = cl.parameters.zone_redundancy; + let redundancy = cl.effective_zone_redundancy(); for replic in 0..cl.replication_factor { for p in 0..NB_PARTITIONS { @@ -1176,8 +1236,9 @@ mod tests { ); cl.staging_roles.merge(&update); } - cl.staging_parameters - .update(LayoutParameters { zone_redundancy }); + cl.staging_parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); cl.staging_hash = cl.calculate_staging_hash(); } -- cgit v1.2.3