From fe9af1dcaae31a117528a9cfa10c422c9a850201 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 17:49:06 +0100 Subject: WIP: garage_rpc: store layout version history --- src/rpc/layout/version.rs | 1052 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1052 insertions(+) create mode 100644 src/rpc/layout/version.rs (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs new file mode 100644 index 00000000..363bc204 --- /dev/null +++ b/src/rpc/layout/version.rs @@ -0,0 +1,1052 @@ +use std::collections::HashMap; +use std::collections::HashSet; +use std::fmt; + +use bytesize::ByteSize; +use itertools::Itertools; + +use garage_util::crdt::{AutoCrdt, LwwMap}; +use garage_util::data::*; +use garage_util::error::*; + +use crate::graph_algo::*; + +use std::convert::TryInto; + +use super::schema::*; +use super::*; + +// The Message type will be used to collect information on the algorithm. +pub type Message = Vec; + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl LayoutVersion { + pub fn new(replication_factor: usize) -> Self { + // We set the default zone redundancy to be Maximum, meaning that the maximum + // possible value will be used depending on the cluster topology + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + LayoutVersion { + version: 0, + replication_factor, + partition_size: 0, + roles: LwwMap::new(), + node_id_vec: Vec::new(), + ring_assignment_data: Vec::new(), + parameters, + } + } + + // ===================== accessors ====================== + + /// Returns a list of IDs of nodes that currently have + /// a role in the cluster + pub fn node_ids(&self) -> &[Uuid] { + &self.node_id_vec[..] + } + + pub fn num_nodes(&self) -> usize { + self.node_id_vec.len() + } + + /// Returns the role of a node in the layout + pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { + match self.roles.get(node) { + Some(NodeRoleV(Some(v))) => Some(v), + _ => None, + } + } + + /// Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole { + capacity: Some(cap), + zone: _, + tags: _, + }) => Ok(*cap), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )), + } + } + + /// Returns the number of partitions associated to this node in the ring + pub fn get_node_usage(&self, uuid: &Uuid) -> Result { + for (i, id) in self.node_id_vec.iter().enumerate() { + if id == uuid { + let mut count = 0; + for nod in self.ring_assignment_data.iter() { + if i as u8 == *nod { + count += 1 + } + } + return Ok(count); + } + } + Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )) + } + + /// Get the partition in which data would fall on + pub fn partition_of(&self, position: &Hash) -> Partition { + let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); + top >> (16 - PARTITION_BITS) + } + + /// Get the list of partitions and the first hash of a partition key that would fall in it + pub fn partitions(&self) -> Vec<(Partition, Hash)> { + (0..(1 << PARTITION_BITS)) + .map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) + .collect::>() + } + + /// Walk the ring to find the n servers in which data should be replicated + pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + assert_eq!(n, self.replication_factor); + + let data = &self.ring_assignment_data; + + if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + warn!("Ring not yet ready, read/writes will be lost!"); + return vec![]; + } + + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + let partition_nodes = &data[partition_start..partition_end]; + + partition_nodes + .iter() + .map(|i| self.node_id_vec[*i as usize]) + .collect::>() + } + + // ===================== internal information extractors ====================== + + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub(crate) fn nongateway_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => result.push(*uuid), + _ => (), + } + } + result + } + + /// Given a node uuids, this function returns the label of its zone + fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + match self.node_role(uuid) { + Some(role) => Ok(&role.zone), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + + /// Returns the sum of capacities of non gateway nodes in the cluster + fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.nongateway_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + Ok(total_capacity) + } + + /// Returns the effective value of the zone_redundancy parameter + fn effective_zone_redundancy(&self) -> usize { + match self.parameters.zone_redundancy { + ZoneRedundancy::AtLeast(v) => v, + ZoneRedundancy::Maximum => { + let n_zones = self + .roles + .items() + .iter() + .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) + .collect::>() + .len(); + std::cmp::min(n_zones, self.replication_factor) + } + } + } + + /// Check a cluster layout for internal consistency + /// (assignment, roles, parameters, partition size) + /// returns true if consistent, false if error + pub fn check(&self) -> Result<(), String> { + // Check that node_id_vec contains the correct list of nodes + let mut expected_nodes = self + .roles + .items() + .iter() + .filter(|(_, _, v)| v.0.is_some()) + .map(|(id, _, _)| *id) + .collect::>(); + expected_nodes.sort(); + let mut node_id_vec = self.node_id_vec.clone(); + node_id_vec.sort(); + if expected_nodes != node_id_vec { + return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); + } + + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); + } + + // Check that the assigned nodes are correct identifiers + // of nodes that are assigned a role + // and that role is not the role of a gateway nodes + for x in self.ring_assignment_data.iter() { + if *x as usize >= self.node_id_vec.len() { + return Err(format!( + "ring_assignment_data contains invalid node id {}", + *x + )); + } + let node = self.node_id_vec[*x as usize]; + match self.roles.get(&node) { + Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), + _ => return Err("ring_assignment_data contains id of a gateway node".into()), + } + } + + // Check that every partition is associated to distinct nodes + let zone_redundancy = self.effective_zone_redundancy(); + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return Err(format!("partition does not contain {} unique node ids", rf)); + } + // Check that every partition is spread over at least zone_redundancy zones. + let zones_of_p = nodes_of_p + .iter() + .map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }) + .collect::>(); + if zones_of_p.iter().unique().count() < zone_redundancy { + return Err(format!( + "nodes of partition are in less than {} distinct zones", + zone_redundancy + )); + } + } + + // Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignment_data.iter() { + node_usage[*n as usize] += 1; + } + for (n, usage) in node_usage.iter().enumerate() { + if *usage > 0 { + let uuid = self.node_id_vec[n]; + let partusage = usage * self.partition_size; + let nodecap = self.get_node_capacity(&uuid).unwrap(); + if partusage > nodecap { + return Err(format!( + "node usage ({}) is bigger than node capacity ({})", + usage * self.partition_size, + nodecap + )); + } + } + } + + // Check that the partition size stored is the one computed by the asignation + // algorithm. + let cl2 = self.clone(); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); + match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { + Ok(s) if s != self.partition_size => { + return Err(format!( + "partition_size ({}) is different than optimal value ({})", + self.partition_size, s + )) + } + Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), + _ => (), + } + + Ok(()) + } + + // ================== updates to layout, internals =================== + + /// This function calculates a new partition-to-node assignment. + /// The computed assignment respects the node replication factor + /// and the zone redundancy parameter It maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignment, it minimizes the distance to + /// the former assignment (if any) to minimize the amount of + /// data to be moved. + /// Staged role changes must be merged with nodes roles before calling this function, + /// hence it must only be called from apply_staged_changes() and hence is not public. + pub(crate) fn calculate_partition_assignment(&mut self) -> Result { + // We update the node ids, since the node role list might have changed with the + // changes in the layout. We retrieve the old_assignment reframed with new ids + let old_assignment_opt = self.update_node_id_vec()?; + + let zone_redundancy = self.effective_zone_redundancy(); + + let mut msg = Message::new(); + msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); + msg.push("".into()); + msg.push(format!( + "Partitions are \ + replicated {} times on at least {} distinct zones.", + self.replication_factor, zone_redundancy + )); + + // We generate for once numerical ids for the zones of non gateway nodes, + // to use them as indices in the flow graphs. + let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; + + let nb_nongateway_nodes = self.nongateway_nodes().len(); + if nb_nongateway_nodes < self.replication_factor { + return Err(Error::Message(format!( + "The number of nodes with positive \ + capacity ({}) is smaller than the replication factor ({}).", + nb_nongateway_nodes, self.replication_factor + ))); + } + if id_to_zone.len() < zone_redundancy { + return Err(Error::Message(format!( + "The number of zones with non-gateway \ + nodes ({}) is smaller than the redundancy parameter ({})", + id_to_zone.len(), + zone_redundancy + ))); + } + + // We compute the optimal partition size + // Capacities should be given in a unit so that partition size is at least 100. + // In this case, integer rounding plays a marginal role in the percentages of + // optimality. + let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; + + msg.push("".into()); + if old_assignment_opt.is_some() { + msg.push(format!( + "Optimal partition size: {} ({} in previous layout)", + ByteSize::b(partition_size).to_string_as(false), + ByteSize::b(self.partition_size).to_string_as(false) + )); + } else { + msg.push(format!( + "Optimal partition size: {}", + ByteSize::b(partition_size).to_string_as(false) + )); + } + // We write the partition size. + self.partition_size = partition_size; + + if partition_size < 100 { + msg.push( + "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" + .into(), + ); + } + + // We compute a first flow/assignment that is heuristically close to the previous + // assignment + let mut gflow = + self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; + if let Some(assoc) = &old_assignment_opt { + // We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; + } + + // We display statistics of the computation + msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); + + // We update the layout structure + self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + + if let Err(e) = self.check() { + return Err(Error::Message( + format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) + )); + } + + Ok(msg) + } + + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignment given by ring, with the new indices of the nodes, and + /// None if the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignment + /// do modify assignment_ring and node_id_vec. + fn update_node_id_vec(&mut self) -> Result>>, Error> { + // (1) We compute the new node list + // Non gateway nodes should be coded on 8bits, hence they must be first in the list + // We build the new node ids + let new_non_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) + .map(|(k, _, _)| *k) + .collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(Error::Message(format!( + "There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", + MAX_NODE_NUMBER + ))); + } + + let new_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) + .map(|(k, _, _)| *k) + .collect(); + + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.extend(new_non_gateway_nodes); + new_node_id_vec.extend(new_gateway_nodes); + + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); + + // (2) We retrieve the old association + // We rewrite the old association with the new indices. We only consider partition + // to node assignments where the node is still in use. + if self.ring_assignment_data.is_empty() { + // This is a new association + return Ok(None); + } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "The old assignment does not have a size corresponding to \ + the old replication factor or the number of partitions." + .into(), + )); + } + + // We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + // We add the indices of only the new non-gateway nodes that can be used in the + // association ring + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i); + } + + let mut old_assignment = vec![Vec::::new(); NB_PARTITIONS]; + let rf = self.replication_factor; + + for (p, old_assign_p) in old_assignment.iter_mut().enumerate() { + for old_id in &self.ring_assignment_data[p * rf..(p + 1) * rf] { + let uuid = old_node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assign_p.push(uuid_to_new_id[&uuid]); + } + } + } + + // We write the ring + self.ring_assignment_data = Vec::::new(); + + Ok(Some(old_assignment)) + } + + /// This function generates ids for the zone of the nodes appearing in + /// self.node_id_vec. + fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.nongateway_nodes().iter() { + let r = self.node_role(uuid).unwrap(); + if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + } + Ok((id_to_zone, zone_to_id)) + } + + /// This function computes by dichotomy the largest realizable partition size, given + /// the layout roles and parameters. + fn compute_optimal_partition_size( + &self, + zone_to_id: &HashMap, + zone_redundancy: usize, + ) -> Result { + let empty_set = HashSet::<(usize, usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + return Err(Error::Message( + "The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1." + .into(), + )); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down + 1 < s_up { + g = self.generate_flow_graph( + (s_down + s_up) / 2, + zone_to_id, + &empty_set, + zone_redundancy, + )?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + s_up = (s_down + s_up) / 2; + } else { + s_down = (s_down + s_up) / 2; + } + } + + Ok(s_down) + } + + fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + vertices + } + + /// Generates the graph to compute the maximal flow corresponding to the optimal + /// partition assignment. + /// exclude_assoc is the set of (partition, node) association that we are forbidden + /// to use (hence we do not add the corresponding edge to the graph). This parameter + /// is used to compute a first flow that uses only edges appearing in the previous + /// assignment. This produces a solution that heuristically should be close to the + /// previous one. + fn generate_flow_graph( + &self, + partition_size: u64, + zone_to_id: &HashMap, + exclude_assoc: &HashSet<(usize, usize)>, + zone_redundancy: usize, + ) -> Result, Error> { + let vertices = + LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + let mut g = Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; + g.add_edge( + Vertex::Source, + Vertex::Pdown(p), + (self.replication_factor - zone_redundancy) as u64, + )?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; + g.add_edge( + Vertex::Pdown(p), + Vertex::PZ(p, z), + self.replication_factor as u64, + )?; + } + } + for n in 0..self.nongateway_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p, n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + Ok(g) + } + + /// This function computes a first optimal assignment (in the form of a flow graph). + fn compute_candidate_assignment( + &self, + zone_to_id: &HashMap, + prev_assign_opt: &Option>>, + zone_redundancy: usize, + ) -> Result, Error> { + // We list the (partition,node) associations that are not used in the + // previous assignment + let mut exclude_edge = HashSet::<(usize, usize)>::new(); + if let Some(prev_assign) = prev_assign_opt { + let nb_nodes = self.nongateway_nodes().len(); + for (p, prev_assign_p) in prev_assign.iter().enumerate() { + for n in 0..nb_nodes { + exclude_edge.insert((p, n)); + } + for n in prev_assign_p.iter() { + exclude_edge.remove(&(p, *n)); + } + } + } + + // We compute the best flow using only the edges used in the previous assignment + let mut g = self.generate_flow_graph( + self.partition_size, + zone_to_id, + &exclude_edge, + zone_redundancy, + )?; + g.compute_maximal_flow()?; + + // We add the excluded edges and compute the maximal flow with the full graph. + // The algorithm is such that it will start with the flow that we just computed + // and find ameliorating paths from that. + for (p, n) in exclude_edge.iter() { + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + Ok(g) + } + + /// This function updates the flow graph gflow to minimize the distance between + /// its corresponding assignment and the previous one + fn minimize_rebalance_load( + &self, + gflow: &mut Graph, + zone_to_id: &HashMap, + prev_assign: &[Vec], + ) -> Result<(), Error> { + // We define a cost function on the edges (pairs of vertices) corresponding + // to the distance between the two assignments. + let mut cost = CostFunction::new(); + for (p, assoc_p) in prev_assign.iter().enumerate() { + for n in assoc_p.iter() { + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); + } + } + + // We compute the maximal length of a simple path in gflow. It is used in the + // Bellman-Ford algorithm in optimize_flow_with_cost to set the number + // of iterations. + let nb_nodes = self.nongateway_nodes().len(); + let path_length = 4 * nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + Ok(()) + } + + /// This function updates the assignment ring from the flow graph. + fn update_ring_from_flow( + &mut self, + nb_zones: usize, + gflow: &Graph, + ) -> Result<(), Error> { + self.ring_assignment_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + for vertex in assoc_vertex.iter() { + if let Vertex::N(n) = vertex { + self.ring_assignment_data.push((*n).try_into().unwrap()); + } + } + } + } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "Critical Error : the association ring we produced does not \ + have the right size." + .into(), + )); + } + Ok(()) + } + + /// This function returns a message summing up the partition repartition of the new + /// layout, and other statistics of the partition assignment computation. + fn output_stat( + &self, + gflow: &Graph, + prev_assign_opt: &Option>>, + zone_to_id: &HashMap, + id_to_zone: &[String], + ) -> Result { + let mut msg = Message::new(); + + let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push(format!( + "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", + ByteSize::b(used_cap).to_string_as(false), + ByteSize::b(total_cap).to_string_as(false), + percent_cap + )); + msg.push(format!( + "Effective capacity (replication factor {}): {}", + self.replication_factor, + ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) + )); + if percent_cap < 80. { + msg.push("".into()); + msg.push( + "If the percentage is too low, it might be that the \ + cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ + storage capacities." + .into(), + ); + msg.push( + "You might want to move storage capacity between zones or relax the redundancy constraint." + .into(), + ); + msg.push( + "See the detailed statistics below and look for saturated nodes/zones.".into(), + ); + } + + // We define and fill in the following tables + let storing_nodes = self.nongateway_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..NB_PARTITIONS { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + if !pz_nodes.is_empty() { + stored_partitions_zone[z] += 1; + if let Some(prev_assign) = prev_assign_opt { + let mut old_zones_of_p = Vec::::new(); + for n in prev_assign[p].iter() { + old_zones_of_p + .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(prev_assign) = prev_assign_opt { + if !prev_assign[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + } + } + + if prev_assign_opt.is_none() { + new_partitions = stored_partitions.clone(); + //new_partitions_zone = stored_partitions_zone.clone(); + } + + // We display the statistics + + msg.push("".into()); + if prev_assign_opt.is_some() { + let total_new_partitions: usize = new_partitions.iter().sum(); + msg.push(format!( + "A total of {} new copies of partitions need to be \ + transferred.", + total_new_partitions + )); + msg.push("".into()); + } + + let mut table = vec![]; + for z in 0..id_to_zone.len() { + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len() { + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions: usize = + nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); + table.push(format!( + "{}\tTags\tPartitions\tCapacity\tUsable capacity", + id_to_zone[z] + )); + + let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; + let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); + table.push(format!( + " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", + self.node_id_vec[*n], + tags_n, + stored_partitions[*n], + new_partitions[*n], + ByteSize::b(total_cap_n).to_string_as(false), + ByteSize::b(available_cap_n).to_string_as(false), + (available_cap_n as f32) / (total_cap_n as f32) * 100.0, + )); + } + + table.push(format!( + " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", + replicated_partitions, + stored_partitions_zone[z], + //new_partitions_zone[z], + ByteSize::b(total_cap_z).to_string_as(false), + ByteSize::b(available_cap_z).to_string_as(false), + percent_cap_z + )); + table.push("".into()); + } + msg.push(format_table::format_table_to_string(table)); + + Ok(msg) + } +} + +// ==================================================================================== + +#[cfg(test)] +mod tests { + use super::{Error, *}; + use std::cmp::min; + + // This function checks that the partition size S computed is at least better than the + // one given by a very naive algorithm. To do so, we try to run the naive algorithm + // assuming a partion size of S+1. If we succed, it means that the optimal assignment + // was not optimal. The naive algorithm is the following : + // - we compute the max number of partitions associated to every node, capped at the + // partition number. It gives the number of tokens of every node. + // - every zone has a number of tokens equal to the sum of the tokens of its nodes. + // - we cycle over the partitions and associate zone tokens while respecting the + // zone redundancy constraint. + // NOTE: the naive algorithm is not optimal. Counter example: + // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + // With these parameters, the naive algo fails, whereas there is a solution: + // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &LayoutVersion) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.nongateway_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert( + z.clone(), + zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), + ); + } + + // For every partition, we count the number of zone already associated and + // the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; + + let mut curr_zone = 0; + + let redundancy = cl.effective_zone_redundancy(); + + for replic in 0..cl.replication_factor { + for p in 0..NB_PARTITIONS { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } + + fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } + } + + fn update_layout( + cl: &mut LayoutVersion, + node_id_vec: &Vec, + node_capacity_vec: &Vec, + node_zone_vec: &Vec, + zone_redundancy: usize, + ) { + for i in 0..node_id_vec.len() { + if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { + cl.node_id_vec.push(x); + } + + let update = cl.staging_roles.update_mutator( + cl.node_id_vec[i], + NodeRoleV(Some(NodeRole { + zone: (node_zone_vec[i].to_string()), + capacity: (Some(node_capacity_vec[i])), + tags: (vec![]), + })), + ); + cl.staging_roles.merge(&update); + } + cl.staging_parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); + cl.staging_hash = cl.calculate_staging_hash(); + } + + #[test] + fn test_assignment() { + let mut node_id_vec = vec![1, 2, 3]; + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let mut cl = LayoutVersion::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + let v = cl.version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + } +} -- cgit v1.2.3 From 523d2ecb9511f74e144cd116b942d6c1bf0f546d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 11:19:43 +0100 Subject: layout: use separate CRDT for staged layout changes --- src/rpc/layout/version.rs | 54 +++-------------------------------------------- 1 file changed, 3 insertions(+), 51 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 363bc204..6918fdf9 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -1,69 +1,21 @@ use std::collections::HashMap; use std::collections::HashSet; -use std::fmt; +use std::convert::TryInto; use bytesize::ByteSize; use itertools::Itertools; -use garage_util::crdt::{AutoCrdt, LwwMap}; +use garage_util::crdt::LwwMap; use garage_util::data::*; use garage_util::error::*; -use crate::graph_algo::*; - -use std::convert::TryInto; - +use super::graph_algo::*; use super::schema::*; use super::*; // The Message type will be used to collect information on the algorithm. pub type Message = Vec; -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - } -} - impl LayoutVersion { pub fn new(replication_factor: usize) -> Self { // We set the default zone redundancy to be Maximum, meaning that the maximum -- cgit v1.2.3 From 03ebf18830dff1983f09abe6ecb8d8d26daeb446 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:31:59 +0100 Subject: layout: begin managing the update tracker values --- src/rpc/layout/version.rs | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 6918fdf9..65c62f63 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -134,15 +134,14 @@ impl LayoutVersion { // ===================== internal information extractors ====================== /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub(crate) fn nongateway_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result + pub(crate) fn nongateway_nodes(&self) -> impl Iterator + '_ { + self.node_id_vec + .iter() + .copied() + .filter(move |uuid| match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => true, + _ => false, + }) } /// Given a node uuids, this function returns the label of its zone @@ -158,8 +157,8 @@ impl LayoutVersion { /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; - for uuid in self.nongateway_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; + for uuid in self.nongateway_nodes() { + total_capacity += self.get_node_capacity(&uuid)?; } Ok(total_capacity) } @@ -320,7 +319,7 @@ impl LayoutVersion { // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_nongateway_nodes = self.nongateway_nodes().len(); + let nb_nongateway_nodes = self.nongateway_nodes().count(); if nb_nongateway_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ @@ -479,7 +478,8 @@ impl LayoutVersion { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - for uuid in self.nongateway_nodes().iter() { + let nongateway_nodes = self.nongateway_nodes().collect::>(); + for uuid in nongateway_nodes.iter() { let r = self.node_role(uuid).unwrap(); if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); @@ -556,8 +556,10 @@ impl LayoutVersion { exclude_assoc: &HashSet<(usize, usize)>, zone_redundancy: usize, ) -> Result, Error> { - let vertices = - LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + let vertices = LayoutVersion::generate_graph_vertices( + zone_to_id.len(), + self.nongateway_nodes().count(), + ); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); for p in 0..NB_PARTITIONS { @@ -576,7 +578,7 @@ impl LayoutVersion { )?; } } - for n in 0..self.nongateway_nodes().len() { + for n in 0..self.nongateway_nodes().count() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; @@ -600,7 +602,7 @@ impl LayoutVersion { // previous assignment let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().len(); + let nb_nodes = self.nongateway_nodes().count(); for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); @@ -652,7 +654,7 @@ impl LayoutVersion { // We compute the maximal length of a simple path in gflow. It is used in the // Bellman-Ford algorithm in optimize_flow_with_cost to set the number // of iterations. - let nb_nodes = self.nongateway_nodes().len(); + let nb_nodes = self.nongateway_nodes().count(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -730,7 +732,7 @@ impl LayoutVersion { } // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes(); + let storing_nodes = self.nongateway_nodes().collect::>(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -873,9 +875,9 @@ mod tests { for z in zones.iter() { zone_token.insert(z.clone(), 0); } - for uuid in cl.nongateway_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; + for uuid in cl.nongateway_nodes() { + let z = cl.get_node_zone(&uuid)?; + let c = cl.get_node_capacity(&uuid)?; zone_token.insert( z.clone(), zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), -- cgit v1.2.3 From df36cf3099f6010c4fc62109b85d4d1e62f160cc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 16:32:31 +0100 Subject: layout: add helpers to LayoutHistory and prepare integration with Table --- src/rpc/layout/version.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 65c62f63..8133672a 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -109,7 +109,7 @@ impl LayoutVersion { .collect::>() } - /// Walk the ring to find the n servers in which data should be replicated + /// Return the n servers in which data for this hash should be replicated pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { assert_eq!(n, self.replication_factor); -- cgit v1.2.3 From ce89d1ddabe3b9e638b0173949726522ae9a0311 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 12:08:32 +0100 Subject: table sync: adapt to new layout history --- src/rpc/layout/version.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 8133672a..f45a3c35 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -98,15 +98,13 @@ impl LayoutVersion { } /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); - (i as u16, Hash::from(location)) - }) - .collect::>() + pub fn partitions(&self) -> impl Iterator + '_ { + (0..(1 << PARTITION_BITS)).map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) } /// Return the n servers in which data for this hash should be replicated -- cgit v1.2.3 From 9a491fa1372a23e91c793ee1d2b313607752826a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 13:10:59 +0100 Subject: layout: fix test --- src/rpc/layout/version.rs | 172 ++-------------------------------------------- 1 file changed, 5 insertions(+), 167 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index f45a3c35..ffbdf277 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -143,7 +143,7 @@ impl LayoutVersion { } /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { match self.node_role(uuid) { Some(role) => Ok(&role.zone), _ => Err(Error::Message( @@ -162,7 +162,7 @@ impl LayoutVersion { } /// Returns the effective value of the zone_redundancy parameter - fn effective_zone_redundancy(&self) -> usize { + pub(crate) fn effective_zone_redundancy(&self) -> usize { match self.parameters.zone_redundancy { ZoneRedundancy::AtLeast(v) => v, ZoneRedundancy::Maximum => { @@ -472,7 +472,9 @@ impl LayoutVersion { /// This function generates ids for the zone of the nodes appearing in /// self.node_id_vec. - fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + pub(crate) fn generate_nongateway_zone_ids( + &self, + ) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); @@ -838,167 +840,3 @@ impl LayoutVersion { Ok(msg) } } - -// ==================================================================================== - -#[cfg(test)] -mod tests { - use super::{Error, *}; - use std::cmp::min; - - // This function checks that the partition size S computed is at least better than the - // one given by a very naive algorithm. To do so, we try to run the naive algorithm - // assuming a partion size of S+1. If we succed, it means that the optimal assignment - // was not optimal. The naive algorithm is the following : - // - we compute the max number of partitions associated to every node, capped at the - // partition number. It gives the number of tokens of every node. - // - every zone has a number of tokens equal to the sum of the tokens of its nodes. - // - we cycle over the partitions and associate zone tokens while respecting the - // zone redundancy constraint. - // NOTE: the naive algorithm is not optimal. Counter example: - // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - // With these parameters, the naive algo fails, whereas there is a solution: - // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &LayoutVersion) -> Result { - let over_size = cl.partition_size + 1; - let mut zone_token = HashMap::::new(); - - let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.nongateway_nodes() { - let z = cl.get_node_zone(&uuid)?; - let c = cl.get_node_capacity(&uuid)?; - zone_token.insert( - z.clone(), - zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), - ); - } - - // For every partition, we count the number of zone already associated and - // the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z, t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; NB_PARTITIONS]; - let mut last_zone = vec![zones.len(); NB_PARTITIONS]; - - let mut curr_zone = 0; - - let redundancy = cl.effective_zone_redundancy(); - - for replic in 0..cl.replication_factor { - for p in 0..NB_PARTITIONS { - while id_zone_token[curr_zone] == 0 - || (last_zone[p] == curr_zone - && redundancy - nb_token[p] <= cl.replication_factor - replic) - { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg: &Message) { - for s in msg.iter() { - println!("{}", s); - } - } - - fn update_layout( - cl: &mut LayoutVersion, - node_id_vec: &Vec, - node_capacity_vec: &Vec, - node_zone_vec: &Vec, - zone_redundancy: usize, - ) { - for i in 0..node_id_vec.len() { - if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { - cl.node_id_vec.push(x); - } - - let update = cl.staging_roles.update_mutator( - cl.node_id_vec[i], - NodeRoleV(Some(NodeRole { - zone: (node_zone_vec[i].to_string()), - capacity: (Some(node_capacity_vec[i])), - tags: (vec![]), - })), - ); - cl.staging_roles.merge(&update); - } - cl.staging_parameters.update(LayoutParameters { - zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), - }); - cl.staging_hash = cl.calculate_staging_hash(); - } - - #[test] - fn test_assignment() { - let mut node_id_vec = vec![1, 2, 3]; - let mut node_capacity_vec = vec![4000, 1000, 2000]; - let mut node_zone_vec = vec!["A", "B", "C"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - - let mut cl = LayoutVersion::new(3); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![ - 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, - ]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - let v = cl.version; - let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - } -} -- cgit v1.2.3 From 8e292e06b3fde1d3b5b019a26eabd4f0d9ac22c3 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 12:48:38 +0100 Subject: layout: some refactoring of nongateway nodes --- src/rpc/layout/version.rs | 98 ++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 44 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index ffbdf277..a7f387b6 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -5,7 +5,7 @@ use std::convert::TryInto; use bytesize::ByteSize; use itertools::Itertools; -use garage_util::crdt::LwwMap; +use garage_util::crdt::{Crdt, LwwMap}; use garage_util::data::*; use garage_util::error::*; @@ -30,6 +30,7 @@ impl LayoutVersion { partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), + nongateway_node_count: 0, ring_assignment_data: Vec::new(), parameters, } @@ -43,6 +44,11 @@ impl LayoutVersion { &self.node_id_vec[..] } + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub fn nongateway_nodes(&self) -> &[Uuid] { + &self.node_id_vec[..self.nongateway_node_count] + } + pub fn num_nodes(&self) -> usize { self.node_id_vec.len() } @@ -56,18 +62,14 @@ impl LayoutVersion { } /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid: &Uuid) -> Option { match self.node_role(uuid) { Some(NodeRole { capacity: Some(cap), zone: _, tags: _, - }) => Ok(*cap), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." - .into(), - )), + }) => Some(*cap), + _ => None, } } @@ -131,17 +133,6 @@ impl LayoutVersion { // ===================== internal information extractors ====================== - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub(crate) fn nongateway_nodes(&self) -> impl Iterator + '_ { - self.node_id_vec - .iter() - .copied() - .filter(move |uuid| match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => true, - _ => false, - }) - } - /// Given a node uuids, this function returns the label of its zone pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { match self.node_role(uuid) { @@ -152,11 +143,16 @@ impl LayoutVersion { } } + fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { + self.get_node_capacity(&uuid) + .expect("non-gateway node with zero capacity") + } + /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { - total_capacity += self.get_node_capacity(&uuid)?; + total_capacity += self.expect_get_node_capacity(&uuid); } Ok(total_capacity) } @@ -257,7 +253,7 @@ impl LayoutVersion { if *usage > 0 { let uuid = self.node_id_vec[n]; let partusage = usage * self.partition_size; - let nodecap = self.get_node_capacity(&uuid).unwrap(); + let nodecap = self.expect_get_node_capacity(&uuid); if partusage > nodecap { return Err(format!( "node usage ({}) is bigger than node capacity ({})", @@ -288,6 +284,21 @@ impl LayoutVersion { // ================== updates to layout, internals =================== + pub(crate) fn calculate_next_version( + mut self, + staging: &LayoutStaging, + ) -> Result<(Self, Message), Error> { + self.version += 1; + + self.roles.merge(&staging.roles); + self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = *staging.parameters.get(); + + let msg = self.calculate_partition_assignment()?; + + Ok((self, msg)) + } + /// This function calculates a new partition-to-node assignment. /// The computed assignment respects the node replication factor /// and the zone redundancy parameter It maximizes the capacity of a @@ -297,7 +308,7 @@ impl LayoutVersion { /// data to be moved. /// Staged role changes must be merged with nodes roles before calling this function, /// hence it must only be called from apply_staged_changes() and hence is not public. - pub(crate) fn calculate_partition_assignment(&mut self) -> Result { + fn calculate_partition_assignment(&mut self) -> Result { // We update the node ids, since the node role list might have changed with the // changes in the layout. We retrieve the old_assignment reframed with new ids let old_assignment_opt = self.update_node_id_vec()?; @@ -317,12 +328,12 @@ impl LayoutVersion { // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_nongateway_nodes = self.nongateway_nodes().count(); - if nb_nongateway_nodes < self.replication_factor { + if self.nongateway_nodes().len() < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_nongateway_nodes, self.replication_factor + self.nongateway_nodes().len(), + self.replication_factor ))); } if id_to_zone.len() < zone_redundancy { @@ -420,12 +431,14 @@ impl LayoutVersion { .map(|(k, _, _)| *k) .collect(); - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.extend(new_non_gateway_nodes); - new_node_id_vec.extend(new_gateway_nodes); + let old_node_id_vec = std::mem::take(&mut self.node_id_vec); + + self.nongateway_node_count = new_non_gateway_nodes.len(); + self.node_id_vec.clear(); + self.node_id_vec.extend(new_non_gateway_nodes); + self.node_id_vec.extend(new_gateway_nodes); - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); + let new_node_id_vec = &self.node_id_vec; // (2) We retrieve the old association // We rewrite the old association with the new indices. We only consider partition @@ -464,7 +477,7 @@ impl LayoutVersion { } } - // We write the ring + // We clear the ring assignemnt data self.ring_assignment_data = Vec::::new(); Ok(Some(old_assignment)) @@ -478,8 +491,7 @@ impl LayoutVersion { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - let nongateway_nodes = self.nongateway_nodes().collect::>(); - for uuid in nongateway_nodes.iter() { + for uuid in self.nongateway_nodes().iter() { let r = self.node_role(uuid).unwrap(); if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); @@ -556,10 +568,8 @@ impl LayoutVersion { exclude_assoc: &HashSet<(usize, usize)>, zone_redundancy: usize, ) -> Result, Error> { - let vertices = LayoutVersion::generate_graph_vertices( - zone_to_id.len(), - self.nongateway_nodes().count(), - ); + let vertices = + LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); for p in 0..NB_PARTITIONS { @@ -578,8 +588,8 @@ impl LayoutVersion { )?; } } - for n in 0..self.nongateway_nodes().count() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + for n in 0..self.nongateway_nodes().len() { + let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]); let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { @@ -602,7 +612,7 @@ impl LayoutVersion { // previous assignment let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().count(); + let nb_nodes = self.nongateway_nodes().len(); for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); @@ -654,7 +664,7 @@ impl LayoutVersion { // We compute the maximal length of a simple path in gflow. It is used in the // Bellman-Ford algorithm in optimize_flow_with_cost to set the number // of iterations. - let nb_nodes = self.nongateway_nodes().count(); + let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -732,7 +742,7 @@ impl LayoutVersion { } // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes().collect::>(); + let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -804,13 +814,13 @@ impl LayoutVersion { let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let mut total_cap_z = 0; for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]); } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; - let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]); let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); table.push(format!( " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", -- cgit v1.2.3 From 1aab1f4e688ebc3f3adcb41c817c16c688a3291c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:06:16 +0100 Subject: layout: refactoring of all_nodes --- src/rpc/layout/version.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index a7f387b6..2cbdcee2 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -38,22 +38,19 @@ impl LayoutVersion { // ===================== accessors ====================== - /// Returns a list of IDs of nodes that currently have - /// a role in the cluster - pub fn node_ids(&self) -> &[Uuid] { + /// Returns a list of IDs of nodes that have a role in this + /// version of the cluster layout, including gateway nodes + pub fn all_nodes(&self) -> &[Uuid] { &self.node_id_vec[..] } - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + /// Returns a list of IDs of nodes that have a storage capacity + /// assigned in this version of the cluster layout pub fn nongateway_nodes(&self) -> &[Uuid] { &self.node_id_vec[..self.nongateway_node_count] } - pub fn num_nodes(&self) -> usize { - self.node_id_vec.len() - } - - /// Returns the role of a node in the layout + /// Returns the role of a node in the layout, if it has one pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { match self.roles.get(node) { Some(NodeRoleV(Some(v))) => Some(v), @@ -61,7 +58,7 @@ impl LayoutVersion { } } - /// Given a node uuids, this function returns its capacity or fails if it does not have any + /// Returns the capacity of a node in the layout, if it has one pub fn get_node_capacity(&self, uuid: &Uuid) -> Option { match self.node_role(uuid) { Some(NodeRole { -- cgit v1.2.3 From 3b361d2959e3d577bdae6f8a5ccb0c9d5526b7ea Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 14:28:16 +0100 Subject: layout: prepare for write sets --- src/rpc/layout/version.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 2cbdcee2..912ee538 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -107,25 +107,24 @@ impl LayoutVersion { } /// Return the n servers in which data for this hash should be replicated - pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + pub fn nodes_of(&self, position: &Hash, n: usize) -> impl Iterator + '_ { assert_eq!(n, self.replication_factor); let data = &self.ring_assignment_data; - if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + let partition_nodes = if data.len() == self.replication_factor * (1 << PARTITION_BITS) { + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + &data[partition_start..partition_end] + } else { warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition_start = partition_idx * self.replication_factor; - let partition_end = (partition_idx + 1) * self.replication_factor; - let partition_nodes = &data[partition_start..partition_end]; + &[] + }; partition_nodes .iter() - .map(|i| self.node_id_vec[*i as usize]) - .collect::>() + .map(move |i| self.node_id_vec[*i as usize]) } // ===================== internal information extractors ====================== -- cgit v1.2.3 From b3e729f4b8ec3b06593f8d3b161c76b1263d9f13 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 12:15:58 +0100 Subject: layout history merge: rm invalid versions when valid versions are added --- src/rpc/layout/version.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 912ee538..947fab56 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -174,6 +174,16 @@ impl LayoutVersion { /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> Result<(), String> { + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); + } + // Check that node_id_vec contains the correct list of nodes let mut expected_nodes = self .roles @@ -189,16 +199,6 @@ impl LayoutVersion { return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } - // Check that the assignment data has the correct length - let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; - if self.ring_assignment_data.len() != expected_assignment_data_len { - return Err(format!( - "ring_assignment_data has incorrect length {} instead of {}", - self.ring_assignment_data.len(), - expected_assignment_data_len - )); - } - // Check that the assigned nodes are correct identifiers // of nodes that are assigned a role // and that role is not the role of a gateway nodes -- cgit v1.2.3 From 063294dd569e10c6d85e29eb6507249eece00956 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 11:50:58 +0100 Subject: layout version: refactor get_node_zone --- src/rpc/layout/version.rs | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 947fab56..cbfbee94 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -70,6 +70,14 @@ impl LayoutVersion { } } + /// Given a node uuids, this function returns the label of its zone if it has one + pub fn get_node_zone(&self, uuid: &Uuid) -> Option<&str> { + match self.node_role(uuid) { + Some(role) => Some(&role.zone), + _ => None, + } + } + /// Returns the number of partitions associated to this node in the ring pub fn get_node_usage(&self, uuid: &Uuid) -> Result { for (i, id) in self.node_id_vec.iter().enumerate() { @@ -129,28 +137,22 @@ impl LayoutVersion { // ===================== internal information extractors ====================== - /// Given a node uuids, this function returns the label of its zone - pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { - match self.node_role(uuid) { - Some(role) => Ok(&role.zone), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - - fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { + pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { self.get_node_capacity(&uuid) .expect("non-gateway node with zero capacity") } + pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str { + self.get_node_zone(&uuid).expect("node without a zone") + } + /// Returns the sum of capacities of non gateway nodes in the cluster - fn get_total_capacity(&self) -> Result { + fn get_total_capacity(&self) -> u64 { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { total_capacity += self.expect_get_node_capacity(&uuid); } - Ok(total_capacity) + total_capacity } /// Returns the effective value of the zone_redundancy parameter @@ -227,10 +229,7 @@ impl LayoutVersion { // Check that every partition is spread over at least zone_redundancy zones. let zones_of_p = nodes_of_p .iter() - .map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }) + .map(|n| self.expect_get_node_zone(&self.node_id_vec[*n as usize])) .collect::>(); if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( @@ -516,7 +515,7 @@ impl LayoutVersion { } let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; + let mut s_up = self.get_total_capacity(); while s_down + 1 < s_up { g = self.generate_flow_graph( (s_down + s_up) / 2, @@ -586,7 +585,7 @@ impl LayoutVersion { } for n in 0..self.nongateway_nodes().len() { let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]); - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[n])]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { @@ -632,7 +631,7 @@ impl LayoutVersion { // The algorithm is such that it will start with the flow that we just computed // and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; @@ -652,7 +651,7 @@ impl LayoutVersion { let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } @@ -707,7 +706,7 @@ impl LayoutVersion { let mut msg = Message::new(); let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; - let total_cap = self.get_total_capacity()?; + let total_cap = self.get_total_capacity(); let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push(format!( "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", @@ -754,7 +753,7 @@ impl LayoutVersion { let mut old_zones_of_p = Vec::::new(); for n in prev_assign[p].iter() { old_zones_of_p - .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); + .push(zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]); } if !old_zones_of_p.contains(&z) { new_partitions_zone[z] += 1; @@ -796,7 +795,7 @@ impl LayoutVersion { for z in 0..id_to_zone.len() { let mut nodes_of_z = Vec::::new(); for n in 0..storing_nodes.len() { - if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + if self.expect_get_node_zone(&self.node_id_vec[n]) == id_to_zone[z] { nodes_of_z.push(n); } } -- cgit v1.2.3 From 4dbf254512327ef4e7abbd5525b89bfa5b7ecb6f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 14:15:52 +0100 Subject: layout: refactoring, merge two files --- src/rpc/layout/version.rs | 1 - 1 file changed, 1 deletion(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index cbfbee94..5b307156 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -10,7 +10,6 @@ use garage_util::data::*; use garage_util::error::*; use super::graph_algo::*; -use super::schema::*; use super::*; // The Message type will be used to collect information on the algorithm. -- cgit v1.2.3 From 85b5a6bcd11c0a7651e4c589569e1935a3d18e46 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 15:31:47 +0100 Subject: fix some clippy lints --- src/rpc/layout/version.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/rpc/layout/version.rs') diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 5b307156..ee4b2821 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -137,19 +137,19 @@ impl LayoutVersion { // ===================== internal information extractors ====================== pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { - self.get_node_capacity(&uuid) + self.get_node_capacity(uuid) .expect("non-gateway node with zero capacity") } pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str { - self.get_node_zone(&uuid).expect("node without a zone") + self.get_node_zone(uuid).expect("node without a zone") } /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> u64 { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { - total_capacity += self.expect_get_node_capacity(&uuid); + total_capacity += self.expect_get_node_capacity(uuid); } total_capacity } -- cgit v1.2.3