From c1d1646c4d62300ec48503aa65623ee7e3df8685 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 1 May 2022 09:54:19 +0200 Subject: Change the way new layout assignations are computed. The function now computes an optimal assignation (with respect to partition size) that minimizes the distance to the former assignation, using flow algorithms. This commit was written by Mendes Oulamara --- src/rpc/layout.rs | 881 +++++++++++++++++++++++++++--------------------------- 1 file changed, 447 insertions(+), 434 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index b9c02c21..afd7df17 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,10 +1,14 @@ use std::cmp::Ordering; -use std::collections::{HashMap, HashSet}; +use std::cmp::{min}; +use std::collections::{HashMap}; use serde::{Deserialize, Serialize}; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; +use garage_util::bipartite::*; + +use rand::prelude::SliceRandom; use crate::ring::*; @@ -164,445 +168,454 @@ impl ClusterLayout { true } - /// Calculate an assignation of partitions to nodes - pub fn calculate_partition_assignation(&mut self) -> bool { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); - - println!("Calculating updated partition assignation, this may take some time..."); - println!(); - - // Get old partition assignation - let old_partitions = self.parse_assignation_data(); - - // Start new partition assignation with nodes from old assignation where it is relevant - let mut partitions = old_partitions - .iter() - .map(|old_part| { - let mut new_part = PartitionAss::new(); - for node in old_part.nodes.iter() { - if let Some(role) = node.1 { - if role.capacity.is_some() { - new_part.add(None, n_zones, node.0, role); - } - } - } - new_part - }) - .collect::>(); - - // In various cases, not enough nodes will have been added for all partitions - // in the step above (e.g. due to node removals, or new zones being added). - // Here we add more nodes to make a complete (but sub-optimal) assignation, - // using an initial partition assignation that is calculated using the multi-dc maglev trick - match self.initial_partition_assignation() { - Some(initial_partitions) => { - for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) { - for (id, info) in ipart.nodes.iter() { - if part.nodes.len() < self.replication_factor { - part.add(None, n_zones, id, info.unwrap()); - } - } - assert!(part.nodes.len() == self.replication_factor); - } - } - None => { - // Not enough nodes in cluster to build a correct assignation. - // Signal it by returning an error. - return false; - } - } - - // Calculate how many partitions each node should ideally store, - // and how many partitions they are storing with the current assignation - // This defines our target for which we will optimize in the following loop. - let total_capacity = configured_nodes - .iter() - .map(|(_, info)| info.capacity.unwrap_or(0)) - .sum::() as usize; - let total_partitions = self.replication_factor * (1 << PARTITION_BITS); - let target_partitions_per_node = configured_nodes - .iter() - .map(|(id, info)| { - ( - *id, - info.capacity.unwrap_or(0) as usize * total_partitions / total_capacity, - ) - }) - .collect::>(); - - let mut partitions_per_node = self.partitions_per_node(&partitions[..]); - - println!("Target number of partitions per node:"); - for (node, npart) in target_partitions_per_node.iter() { - println!("{:?}\t{}", node, npart); - } - println!(); - - // Shuffle partitions between nodes so that nodes will reach (or better approach) - // their target number of stored partitions - loop { - let mut option = None; - for (i, part) in partitions.iter_mut().enumerate() { - for (irm, (idrm, _)) in part.nodes.iter().enumerate() { - let errratio = |node, parts| { - let tgt = *target_partitions_per_node.get(node).unwrap() as f32; - (parts - tgt) / tgt - }; - let square = |x| x * x; - - let partsrm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as f32; - - for (idadd, infoadd) in configured_nodes.iter() { - // skip replacing a node by itself - // and skip replacing by gateway nodes - if idadd == idrm || infoadd.capacity.is_none() { - continue; - } - - // We want to try replacing node idrm by node idadd - // if that brings us close to our goal. - let partsadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as f32; - let oldcost = square(errratio(*idrm, partsrm) - errratio(*idadd, partsadd)); - let newcost = - square(errratio(*idrm, partsrm - 1.) - errratio(*idadd, partsadd + 1.)); - if newcost >= oldcost { - // not closer to our goal - continue; - } - let gain = oldcost - newcost; - - let mut newpart = part.clone(); - - newpart.nodes.remove(irm); - if !newpart.add(None, n_zones, idadd, infoadd) { - continue; - } - assert!(newpart.nodes.len() == self.replication_factor); - - if !old_partitions[i] - .is_valid_transition_to(&newpart, self.replication_factor) - { - continue; - } - - if option - .as_ref() - .map(|(old_gain, _, _, _, _)| gain > *old_gain) - .unwrap_or(true) - { - option = Some((gain, i, idadd, idrm, newpart)); - } - } - } - } - if let Some((_gain, i, idadd, idrm, newpart)) = option { - *partitions_per_node.entry(idadd).or_insert(0) += 1; - *partitions_per_node.get_mut(idrm).unwrap() -= 1; - partitions[i] = newpart; - } else { - break; - } - } - // Check we completed the assignation correctly - // (this is a set of checks for the algorithm's consistency) - assert!(partitions.len() == (1 << PARTITION_BITS)); - assert!(partitions - .iter() - .all(|p| p.nodes.len() == self.replication_factor)); - - let new_partitions_per_node = self.partitions_per_node(&partitions[..]); - assert!(new_partitions_per_node == partitions_per_node); - - // Show statistics - println!("New number of partitions per node:"); - for (node, npart) in partitions_per_node.iter() { - let tgt = *target_partitions_per_node.get(node).unwrap(); - let pct = 100f32 * (*npart as f32) / (tgt as f32); - println!("{:?}\t{}\t({}% of {})", node, npart, pct as i32, tgt); - } - println!(); - - let mut diffcount = HashMap::new(); - for (oldpart, newpart) in old_partitions.iter().zip(partitions.iter()) { - let nminus = oldpart.txtplus(newpart); - let nplus = newpart.txtplus(oldpart); - if nminus != "[...]" || nplus != "[...]" { - let tup = (nminus, nplus); - *diffcount.entry(tup).or_insert(0) += 1; - } - } - if diffcount.is_empty() { - println!("No data will be moved between nodes."); - } else { - let mut diffcount = diffcount.into_iter().collect::>(); - diffcount.sort(); - println!("Number of partitions that move:"); - for ((nminus, nplus), npart) in diffcount { - println!("\t{}\t{} -> {}", npart, nminus, nplus); - } - } - println!(); - - // Calculate and save new assignation data - let (nodes, assignation_data) = - self.compute_assignation_data(&configured_nodes[..], &partitions[..]); - - self.node_id_vec = nodes; - self.ring_assignation_data = assignation_data; - - true - } - - fn initial_partition_assignation(&self) -> Option>> { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); - - // Create a vector of partition indices (0 to 2**PARTITION_BITS-1) - let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::>(); - - // Prepare ring - let mut partitions: Vec = partitions_idx - .iter() - .map(|_i| PartitionAss::new()) - .collect::>(); - - // Create MagLev priority queues for each node - let mut queues = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(node_id, node_info)| { - let mut parts = partitions_idx - .iter() - .map(|i| { - let part_data = - [&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat(); - (*i, fasthash(&part_data[..])) - }) - .collect::>(); - parts.sort_by_key(|(_i, h)| *h); - let parts_i = parts.iter().map(|(i, _h)| *i).collect::>(); - (node_id, node_info, parts_i, 0) - }) - .collect::>(); - - let max_capacity = configured_nodes - .iter() - .filter_map(|(_, node_info)| node_info.capacity) - .fold(0, std::cmp::max); - - // Fill up ring - for rep in 0..self.replication_factor { - queues.sort_by_key(|(ni, _np, _q, _p)| { - let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat(); - fasthash(&queue_data[..]) - }); - - for (_, _, _, pos) in queues.iter_mut() { - *pos = 0; - } - - let mut remaining = partitions_idx.len(); - while remaining > 0 { - let remaining0 = remaining; - for i_round in 0..max_capacity { - for (node_id, node_info, q, pos) in queues.iter_mut() { - if i_round >= node_info.capacity.unwrap() { - continue; - } - for (pos2, &qv) in q.iter().enumerate().skip(*pos) { - if partitions[qv].add(Some(rep + 1), n_zones, node_id, node_info) { - remaining -= 1; - *pos = pos2 + 1; - break; - } - } - } - } - if remaining == remaining0 { - // No progress made, exit - return None; - } - } - } - - Some(partitions) - } + /// This function calculates a new partition-to-node assignation. + /// The computed assignation maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignation, it minimizes the distance to + /// the former assignation (if any) to minimize the amount of + /// data to be moved. A heuristic ensures node triplets + /// dispersion (in garage_util::bipartite::optimize_matching()). + pub fn calculate_partition_assignation(&mut self) -> bool { + + //The nodes might have been updated, some might have been deleted. + //So we need to first update the list of nodes and retrieve the + //assignation. + let old_node_assignation = self.update_nodes_and_ring(); + + let (node_zone, _) = self.get_node_zone_capacity(); + + //We compute the optimal number of partition to assign to + //every node and zone. + if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions(){ + //We collect part_per_zone in a vec to not rely on the + //arbitrary order in which elements are iterated in + //Hashmap::iter() + let part_per_zone_vec = part_per_zone.iter() + .map(|(x,y)| (x.clone(),*y)) + .collect::>(); + //We create an indexing of the zones + let mut zone_id = HashMap::::new(); + for i in 0..part_per_zone_vec.len(){ + zone_id.insert(part_per_zone_vec[i].0.clone(), i); + } + + //We compute a candidate for the new partition to zone + //assignation. + let nb_zones = part_per_zone.len(); + let nb_nodes = part_per_nod.len(); + let nb_partitions = 1<> = + old_node_assignation.iter().map(|x| x.iter().map( + |id| match *id { Some(i) => zone_id[&node_zone[i]] , + None => no_zone } + ).collect()).collect(); + + //We minimize the distance to the former zone assignation + zone_assignation = optimize_matching( + &old_zone_assignation, &zone_assignation, nb_zones+1); //+1 for no_zone + + //We need to assign partitions to nodes in their zone + //We first put the nodes assignation that can stay the same + for i in 0..nb_partitions{ + for j in 0..self.replication_factor { + if let Some(Some(former_node)) = old_node_assignation[i].iter().find( + |x| if let Some(id) = x { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + } + else {false} + ) + { + if part_per_nod[*former_node] > 0 { + node_assignation[i][j] = Some(*former_node); + part_per_nod[*former_node] -= 1; + } + } + } + } + + + //We complete the assignation of partitions to nodes + let mut rng = rand::thread_rng(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if node_assignation[i][j] == None { + let possible_nodes : Vec = (0..nb_nodes) + .filter( + |id| zone_id[&node_zone[*id]] == zone_assignation[i][j] + && part_per_nod[*id] > 0).collect(); + assert!(possible_nodes.len()>0); + //We randomly pick a node + if let Some(nod) = possible_nodes.choose(&mut rng){ + node_assignation[i][j] = Some(*nod); + part_per_nod[*nod] -= 1; + } + } + } + } + + //We write the assignation in the 1D table + self.ring_assignation_data = Vec::::new(); + for i in 0..nb_partitions{ + for j in 0..self.replication_factor { + if let Some(id) = node_assignation[i][j] { + self.ring_assignation_data.push(id as CompactNodeType); + } + else {assert!(false)} + } + } + + true + } + else { false } + } + + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignation given by ring, with the new indices of the nodes, and + /// None of the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignation + /// do modify assignation_ring and node_id_vec. + fn update_nodes_and_ring(&mut self) -> Vec>> { + let nb_partitions = 1usize< = self.roles.items().iter() + .map(|(k, _, _)| *k) + .collect(); + + if ring.len() == rf*nb_partitions { + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + node_assignation[i][j] = new_node_id_vec.iter() + .position(|id| *id == self.node_id_vec[ring[i*rf + j] as usize]); + } + } + } + + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = vec![]; + return node_assignation; + } + + ///This function compute the number of partition to assign to + ///every node and zone, so that every partition is replicated + ///self.replication_factor times and the capacity of a partition + ///is maximized. + fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { + + let mut zone_capacity :HashMap= HashMap::new(); + + let (node_zone, node_capacity) = self.get_node_zone_capacity(); + let nb_nodes = self.node_id_vec.len(); + + for i in 0..nb_nodes + { + if zone_capacity.contains_key(&node_zone[i]) { + zone_capacity.insert(node_zone[i].clone(), zone_capacity[&node_zone[i]] + node_capacity[i]); + } + else{ + zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); + } + } + + //Compute the optimal number of partitions per zone + let sum_capacities: u32 =zone_capacity.values().sum(); + + if sum_capacities <= 0 { + println!("No storage capacity in the network."); + return None; + } + + let nb_partitions = 1< = + zone_capacity.iter() + .map(|(k, v)| (k.clone(), min(nb_partitions, + (self.replication_factor*nb_partitions + **v as usize)/sum_capacities as usize) ) ).collect(); + + //The replication_factor-1 upper bounds the number of + //part_per_zones that are greater than nb_partitions + for _ in 1..self.replication_factor { + //The number of partitions that are not assignated to + //a zone that takes nb_partitions. + let sum_capleft : u32 = zone_capacity.keys() + .filter(| k | {part_per_zone[*k] < nb_partitions} ) + .map(|k| zone_capacity[k]).sum(); + + //The number of replication of the data that we need + //to ensure. + let repl_left = self.replication_factor + - part_per_zone.values() + .filter(|x| {**x == nb_partitions}) + .count(); + if repl_left == 0 { + break; + } + + for k in zone_capacity.keys() { + if part_per_zone[k] != nb_partitions + { + part_per_zone.insert(k.to_string() , min(nb_partitions, + (nb_partitions*zone_capacity[k] as usize + *repl_left)/sum_capleft as usize)); + } + } + } + + //Now we divide the zone's partition share proportionally + //between their nodes. + + let mut part_per_nod : Vec = (0..nb_nodes).map( + |i| (part_per_zone[&node_zone[i]]*node_capacity[i] as usize)/zone_capacity[&node_zone[i]] as usize + ) + .collect(); + + //We must update the part_per_zone to make it correspond to + //part_per_nod (because of integer rounding) + part_per_zone = part_per_zone.iter().map(|(k,_)| + (k.clone(), 0)) + .collect(); + for i in 0..nb_nodes { + part_per_zone.insert( + node_zone[i].clone() , + part_per_zone[&node_zone[i]] + part_per_nod[i]); + } + + //Because of integer rounding, the total sum of part_per_nod + //might not be replication_factor*nb_partitions. + // We need at most to add 1 to every non maximal value of + // part_per_nod. The capacity of a partition will be bounded + // by the minimal value of + // node_capacity_vec[i]/part_per_nod[i] + // so we try to maximize this minimal value, keeping the + // part_per_zone capped + + let discrepancy : usize = + nb_partitions*self.replication_factor + - part_per_nod.iter().sum::(); + + //We use a stupid O(N^2) algorithm. If the number of nodes + //is actually expected to be high, one should optimize this. + + for _ in 0..discrepancy { + if let Some(idmax) = (0..nb_nodes) + .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) + .max_by( |i,j| + (node_capacity[*i]*(part_per_nod[*j]+1) as u32) + .cmp(&(node_capacity[*j]*(part_per_nod[*i]+1) as u32)) + ) + { + part_per_nod[idmax] += 1; + part_per_zone.insert(node_zone[idmax].clone(),part_per_zone[&node_zone[idmax]]+1); + } + } + + //We check the algorithm consistency + + let discrepancy : usize = + nb_partitions*self.replication_factor + - part_per_nod.iter().sum::(); + assert!(discrepancy == 0); + assert!(if let Some(v) = part_per_zone.values().max() + {*v <= nb_partitions} else {false} ); + + Some((part_per_nod, part_per_zone)) + } + + + //Returns vectors of zone and capacity; indexed by the same (temporary) + //indices as node_id_vec. + fn get_node_zone_capacity(& self) -> (Vec , Vec) { + + let node_zone = self.node_id_vec.iter().map( + |id_nod| match self.node_role(id_nod) { + Some(NodeRole{zone,capacity:_,tags:_}) => zone.clone() , + _ => "".to_string() + } + ).collect(); + + let node_capacity = self.node_id_vec.iter().map( + |id_nod| match self.node_role(id_nod) { + Some(NodeRole{zone:_,capacity,tags:_}) => + if let Some(c)=capacity + {*c} + else {0}, + _ => 0 + } + ).collect(); + + (node_zone,node_capacity) + } - fn configured_nodes_and_zones(&self) -> (Vec<(&Uuid, &NodeRole)>, HashSet<&str>) { - let configured_nodes = self - .roles - .items() - .iter() - .filter(|(_id, _, info)| info.0.is_some()) - .map(|(id, _, info)| (id, info.0.as_ref().unwrap())) - .collect::>(); - - let zones = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(_id, info)| info.zone.as_str()) - .collect::>(); - - (configured_nodes, zones) - } - - fn compute_assignation_data<'a>( - &self, - configured_nodes: &[(&'a Uuid, &'a NodeRole)], - partitions: &[PartitionAss<'a>], - ) -> (Vec, Vec) { - assert!(partitions.len() == (1 << PARTITION_BITS)); - - // Make a canonical order for nodes - let mut nodes = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(id, _)| **id) - .collect::>(); - let nodes_rev = nodes - .iter() - .enumerate() - .map(|(i, id)| (*id, i as CompactNodeType)) - .collect::>(); - - let mut assignation_data = vec![]; - for partition in partitions.iter() { - assert!(partition.nodes.len() == self.replication_factor); - for (id, _) in partition.nodes.iter() { - assignation_data.push(*nodes_rev.get(id).unwrap()); - } - } - - nodes.extend( - configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_none()) - .map(|(id, _)| **id), - ); - - (nodes, assignation_data) - } - - fn parse_assignation_data(&self) -> Vec> { - if self.ring_assignation_data.len() == self.replication_factor * (1 << PARTITION_BITS) { - // If the previous assignation data is correct, use that - let mut partitions = vec![]; - for i in 0..(1 << PARTITION_BITS) { - let mut part = PartitionAss::new(); - for node_i in self.ring_assignation_data - [i * self.replication_factor..(i + 1) * self.replication_factor] - .iter() - { - let node_id = &self.node_id_vec[*node_i as usize]; - - if let Some(NodeRoleV(Some(info))) = self.roles.get(node_id) { - part.nodes.push((node_id, Some(info))); - } else { - part.nodes.push((node_id, None)); - } - } - partitions.push(part); - } - partitions - } else { - // Otherwise start fresh - (0..(1 << PARTITION_BITS)) - .map(|_| PartitionAss::new()) - .collect() - } - } - - fn partitions_per_node<'a>(&self, partitions: &[PartitionAss<'a>]) -> HashMap<&'a Uuid, usize> { - let mut partitions_per_node = HashMap::<&Uuid, usize>::new(); - for p in partitions.iter() { - for (id, _) in p.nodes.iter() { - *partitions_per_node.entry(*id).or_insert(0) += 1; - } - } - partitions_per_node - } -} - -// ---- Internal structs for partition assignation in layout ---- - -#[derive(Clone)] -struct PartitionAss<'a> { - nodes: Vec<(&'a Uuid, Option<&'a NodeRole>)>, } -impl<'a> PartitionAss<'a> { - fn new() -> Self { - Self { nodes: Vec::new() } - } - fn nplus(&self, other: &PartitionAss<'a>) -> usize { - self.nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .count() - } - fn txtplus(&self, other: &PartitionAss<'a>) -> String { - let mut nodes = self - .nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .map(|x| format!("{:?}", x.0)) - .collect::>(); - nodes.sort(); - if self.nodes.iter().any(|x| other.nodes.contains(x)) { - nodes.push("...".into()); - } - format!("[{}]", nodes.join(" ")) - } +#[cfg(test)] +mod tests { + use super::*; + use itertools::Itertools; + + fn check_assignation(cl : &ClusterLayout) { + + //Check that input data has the right format + let nb_partitions = 1usize<>(); + + let zone_vec = node_zone.iter().unique().collect::>(); + let zone_nb_part = zone_vec.iter().map( |z| cl.ring_assignation_data.iter() + .filter(|x| node_zone[**x as usize] == **z) + .count() + ).collect::>(); + + //Check optimality of the zone assignation : would it be better for the + //node_capacity/node_partitions ratio to change the assignation of a partition + + if let Some(idmin) = (0..nb_nodes).min_by( + |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) + ){ + if let Some(idnew) = (0..nb_nodes) + .filter( |i| if let Some(p) = zone_vec.iter().position(|z| **z==node_zone[*i]) + {zone_nb_part[p] < nb_partitions } + else { false }) + .max_by( + |i,j| (node_capacity[*i]*(node_nb_part[*j]as u32+1)) + .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) + ){ + assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= + node_capacity[idnew]*node_nb_part[idmin] as u32); + } + + } + + //In every zone, check optimality of the nod assignation + for z in zone_vec { + let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z ); + if let Some(idmin) = node_of_z_iter.clone().min_by( + |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) + ){ + if let Some(idnew) = node_of_z_iter.min_by( + |i,j| (node_capacity[*i]*(node_nb_part[*j] as u32+1)) + .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) + ){ + assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= + node_capacity[idnew]*node_nb_part[idmin] as u32); + } + } + } + + } + + fn update_layout(cl : &mut ClusterLayout, node_id_vec : &Vec, + node_capacity_vec : &Vec , node_zone_vec : &Vec) { + for i in 0..node_id_vec.len(){ + if let Some(x) = FixedBytes32::try_from(&[i as u8;32]) { + cl.node_id_vec.push(x); + } + + let update = cl.roles.update_mutator(cl.node_id_vec[i] , + NodeRoleV(Some(NodeRole{ + zone : (node_zone_vec[i].to_string()), + capacity : (Some(node_capacity_vec[i])), + tags : (vec![])}))); + cl.roles.merge(&update); + } + } + + #[test] + fn test_assignation() { + + let mut node_id_vec = vec![1,2,3]; + let mut node_capacity_vec = vec![4000,1000,2000]; + let mut node_zone_vec= vec!["A", "B", "C"].into_iter().map(|x| x.to_string()).collect(); + + let mut cl = ClusterLayout { + node_id_vec: vec![], + + roles : LwwMap::new(), + + replication_factor: 3, + ring_assignation_data : vec![], + version:0, + staging: LwwMap::new(), + staging_hash: sha256sum(&[1;32]), + }; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_id_vec = vec![1,2,3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000,1000,1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec= vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"].into_iter().map(|x| x.to_string()).collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000,1000,2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + + node_capacity_vec = vec![4000,4000,2000, 7000, 1000, 9000, 2000, 10, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + } +} - fn is_valid_transition_to(&self, other: &PartitionAss<'a>, replication_factor: usize) -> bool { - let min_keep_nodes_per_part = (replication_factor + 1) / 2; - let n_removed = self.nplus(other); - if self.nodes.len() <= min_keep_nodes_per_part { - n_removed == 0 - } else { - n_removed <= self.nodes.len() - min_keep_nodes_per_part - } - } - // add is a key function in creating a PartitionAss, i.e. the list of nodes - // to which a partition is assigned. It tries to add a certain node id to the - // assignation, but checks that doing so is compatible with the NECESSARY - // condition that the partition assignation must be dispersed over different - // zones (datacenters) if enough zones exist. This is why it takes a n_zones - // parameter, which is the total number of zones that have existing nodes: - // if nodes in the assignation already cover all n_zones zones, then any node - // that is not yet in the assignation can be added. Otherwise, only nodes - // that are in a new zone can be added. - fn add( - &mut self, - target_len: Option, - n_zones: usize, - node: &'a Uuid, - role: &'a NodeRole, - ) -> bool { - if let Some(tl) = target_len { - if self.nodes.len() != tl - 1 { - return false; - } - } - - let p_zns = self - .nodes - .iter() - .map(|(_id, info)| info.unwrap().zone.as_str()) - .collect::>(); - if (p_zns.len() < n_zones && !p_zns.contains(&role.zone.as_str())) - || (p_zns.len() == n_zones && !self.nodes.iter().any(|(id, _)| *id == node)) - { - self.nodes.push((node, Some(role))); - true - } else { - false - } - } -} -- cgit v1.2.3 From 2aeaddd5e2e1911b084f6d49ccb2236b7fec31af Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 1 May 2022 09:57:05 +0200 Subject: Apply cargo fmt --- src/rpc/layout.rs | 940 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 496 insertions(+), 444 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index afd7df17..ac31da72 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,12 +1,12 @@ +use std::cmp::min; use std::cmp::Ordering; -use std::cmp::{min}; -use std::collections::{HashMap}; +use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use garage_util::bipartite::*; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; -use garage_util::bipartite::*; use rand::prelude::SliceRandom; @@ -168,454 +168,506 @@ impl ClusterLayout { true } + /// This function calculates a new partition-to-node assignation. + /// The computed assignation maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignation, it minimizes the distance to + /// the former assignation (if any) to minimize the amount of + /// data to be moved. A heuristic ensures node triplets + /// dispersion (in garage_util::bipartite::optimize_matching()). + pub fn calculate_partition_assignation(&mut self) -> bool { + //The nodes might have been updated, some might have been deleted. + //So we need to first update the list of nodes and retrieve the + //assignation. + let old_node_assignation = self.update_nodes_and_ring(); + + let (node_zone, _) = self.get_node_zone_capacity(); + + //We compute the optimal number of partition to assign to + //every node and zone. + if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions() { + //We collect part_per_zone in a vec to not rely on the + //arbitrary order in which elements are iterated in + //Hashmap::iter() + let part_per_zone_vec = part_per_zone + .iter() + .map(|(x, y)| (x.clone(), *y)) + .collect::>(); + //We create an indexing of the zones + let mut zone_id = HashMap::::new(); + for i in 0..part_per_zone_vec.len() { + zone_id.insert(part_per_zone_vec[i].0.clone(), i); + } - /// This function calculates a new partition-to-node assignation. - /// The computed assignation maximizes the capacity of a - /// partition (assuming all partitions have the same size). - /// Among such optimal assignation, it minimizes the distance to - /// the former assignation (if any) to minimize the amount of - /// data to be moved. A heuristic ensures node triplets - /// dispersion (in garage_util::bipartite::optimize_matching()). - pub fn calculate_partition_assignation(&mut self) -> bool { - - //The nodes might have been updated, some might have been deleted. - //So we need to first update the list of nodes and retrieve the - //assignation. - let old_node_assignation = self.update_nodes_and_ring(); - - let (node_zone, _) = self.get_node_zone_capacity(); - - //We compute the optimal number of partition to assign to - //every node and zone. - if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions(){ - //We collect part_per_zone in a vec to not rely on the - //arbitrary order in which elements are iterated in - //Hashmap::iter() - let part_per_zone_vec = part_per_zone.iter() - .map(|(x,y)| (x.clone(),*y)) - .collect::>(); - //We create an indexing of the zones - let mut zone_id = HashMap::::new(); - for i in 0..part_per_zone_vec.len(){ - zone_id.insert(part_per_zone_vec[i].0.clone(), i); - } - - //We compute a candidate for the new partition to zone - //assignation. - let nb_zones = part_per_zone.len(); - let nb_nodes = part_per_nod.len(); - let nb_partitions = 1<> = - old_node_assignation.iter().map(|x| x.iter().map( - |id| match *id { Some(i) => zone_id[&node_zone[i]] , - None => no_zone } - ).collect()).collect(); - - //We minimize the distance to the former zone assignation - zone_assignation = optimize_matching( - &old_zone_assignation, &zone_assignation, nb_zones+1); //+1 for no_zone - - //We need to assign partitions to nodes in their zone - //We first put the nodes assignation that can stay the same - for i in 0..nb_partitions{ - for j in 0..self.replication_factor { - if let Some(Some(former_node)) = old_node_assignation[i].iter().find( - |x| if let Some(id) = x { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - } - else {false} - ) - { - if part_per_nod[*former_node] > 0 { - node_assignation[i][j] = Some(*former_node); - part_per_nod[*former_node] -= 1; - } - } - } - } - - - //We complete the assignation of partitions to nodes - let mut rng = rand::thread_rng(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if node_assignation[i][j] == None { - let possible_nodes : Vec = (0..nb_nodes) - .filter( - |id| zone_id[&node_zone[*id]] == zone_assignation[i][j] - && part_per_nod[*id] > 0).collect(); - assert!(possible_nodes.len()>0); - //We randomly pick a node - if let Some(nod) = possible_nodes.choose(&mut rng){ - node_assignation[i][j] = Some(*nod); - part_per_nod[*nod] -= 1; - } - } - } - } - - //We write the assignation in the 1D table - self.ring_assignation_data = Vec::::new(); - for i in 0..nb_partitions{ - for j in 0..self.replication_factor { - if let Some(id) = node_assignation[i][j] { - self.ring_assignation_data.push(id as CompactNodeType); - } - else {assert!(false)} - } - } - - true - } - else { false } - } - - /// The LwwMap of node roles might have changed. This function updates the node_id_vec - /// and returns the assignation given by ring, with the new indices of the nodes, and - /// None of the node is not present anymore. - /// We work with the assumption that only this function and calculate_new_assignation - /// do modify assignation_ring and node_id_vec. - fn update_nodes_and_ring(&mut self) -> Vec>> { - let nb_partitions = 1usize< = self.roles.items().iter() - .map(|(k, _, _)| *k) - .collect(); - - if ring.len() == rf*nb_partitions { - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - node_assignation[i][j] = new_node_id_vec.iter() - .position(|id| *id == self.node_id_vec[ring[i*rf + j] as usize]); - } - } - } - - self.node_id_vec = new_node_id_vec; - self.ring_assignation_data = vec![]; - return node_assignation; - } - - ///This function compute the number of partition to assign to - ///every node and zone, so that every partition is replicated - ///self.replication_factor times and the capacity of a partition - ///is maximized. - fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { - - let mut zone_capacity :HashMap= HashMap::new(); - - let (node_zone, node_capacity) = self.get_node_zone_capacity(); - let nb_nodes = self.node_id_vec.len(); - - for i in 0..nb_nodes - { - if zone_capacity.contains_key(&node_zone[i]) { - zone_capacity.insert(node_zone[i].clone(), zone_capacity[&node_zone[i]] + node_capacity[i]); - } - else{ - zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); - } - } - - //Compute the optimal number of partitions per zone - let sum_capacities: u32 =zone_capacity.values().sum(); - - if sum_capacities <= 0 { - println!("No storage capacity in the network."); - return None; - } - - let nb_partitions = 1< = - zone_capacity.iter() - .map(|(k, v)| (k.clone(), min(nb_partitions, - (self.replication_factor*nb_partitions - **v as usize)/sum_capacities as usize) ) ).collect(); - - //The replication_factor-1 upper bounds the number of - //part_per_zones that are greater than nb_partitions - for _ in 1..self.replication_factor { - //The number of partitions that are not assignated to - //a zone that takes nb_partitions. - let sum_capleft : u32 = zone_capacity.keys() - .filter(| k | {part_per_zone[*k] < nb_partitions} ) - .map(|k| zone_capacity[k]).sum(); - - //The number of replication of the data that we need - //to ensure. - let repl_left = self.replication_factor - - part_per_zone.values() - .filter(|x| {**x == nb_partitions}) - .count(); - if repl_left == 0 { - break; - } - - for k in zone_capacity.keys() { - if part_per_zone[k] != nb_partitions - { - part_per_zone.insert(k.to_string() , min(nb_partitions, - (nb_partitions*zone_capacity[k] as usize - *repl_left)/sum_capleft as usize)); - } - } - } - - //Now we divide the zone's partition share proportionally - //between their nodes. - - let mut part_per_nod : Vec = (0..nb_nodes).map( - |i| (part_per_zone[&node_zone[i]]*node_capacity[i] as usize)/zone_capacity[&node_zone[i]] as usize - ) - .collect(); - - //We must update the part_per_zone to make it correspond to - //part_per_nod (because of integer rounding) - part_per_zone = part_per_zone.iter().map(|(k,_)| - (k.clone(), 0)) - .collect(); - for i in 0..nb_nodes { - part_per_zone.insert( - node_zone[i].clone() , - part_per_zone[&node_zone[i]] + part_per_nod[i]); - } - - //Because of integer rounding, the total sum of part_per_nod - //might not be replication_factor*nb_partitions. - // We need at most to add 1 to every non maximal value of - // part_per_nod. The capacity of a partition will be bounded - // by the minimal value of - // node_capacity_vec[i]/part_per_nod[i] - // so we try to maximize this minimal value, keeping the - // part_per_zone capped - - let discrepancy : usize = - nb_partitions*self.replication_factor - - part_per_nod.iter().sum::(); - - //We use a stupid O(N^2) algorithm. If the number of nodes - //is actually expected to be high, one should optimize this. - - for _ in 0..discrepancy { - if let Some(idmax) = (0..nb_nodes) - .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) - .max_by( |i,j| - (node_capacity[*i]*(part_per_nod[*j]+1) as u32) - .cmp(&(node_capacity[*j]*(part_per_nod[*i]+1) as u32)) - ) - { - part_per_nod[idmax] += 1; - part_per_zone.insert(node_zone[idmax].clone(),part_per_zone[&node_zone[idmax]]+1); - } - } - - //We check the algorithm consistency - - let discrepancy : usize = - nb_partitions*self.replication_factor - - part_per_nod.iter().sum::(); - assert!(discrepancy == 0); - assert!(if let Some(v) = part_per_zone.values().max() - {*v <= nb_partitions} else {false} ); - - Some((part_per_nod, part_per_zone)) - } - - - //Returns vectors of zone and capacity; indexed by the same (temporary) - //indices as node_id_vec. - fn get_node_zone_capacity(& self) -> (Vec , Vec) { - - let node_zone = self.node_id_vec.iter().map( - |id_nod| match self.node_role(id_nod) { - Some(NodeRole{zone,capacity:_,tags:_}) => zone.clone() , - _ => "".to_string() - } - ).collect(); - - let node_capacity = self.node_id_vec.iter().map( - |id_nod| match self.node_role(id_nod) { - Some(NodeRole{zone:_,capacity,tags:_}) => - if let Some(c)=capacity - {*c} - else {0}, - _ => 0 - } - ).collect(); - - (node_zone,node_capacity) - } + //We compute a candidate for the new partition to zone + //assignation. + let nb_zones = part_per_zone.len(); + let nb_nodes = part_per_nod.len(); + let nb_partitions = 1 << PARTITION_BITS; + let left_cap_vec = vec![self.replication_factor as u32; nb_partitions]; + let right_cap_vec = part_per_zone_vec.iter().map(|(_, y)| *y as u32).collect(); + let mut zone_assignation = dinic_compute_matching(left_cap_vec, right_cap_vec); + + //We create the structure for the partition-to-node assignation. + let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; + //We will decrement part_per_nod to keep track of the number + //of partitions that we still have to associate. + let mut part_per_nod = part_per_nod.clone(); + + //We minimize the distance to the former assignation(if any) + + //We get the id of the zones of the former assignation + //(and the id no_zone if there is no node assignated) + let no_zone = part_per_zone_vec.len(); + let old_zone_assignation: Vec> = old_node_assignation + .iter() + .map(|x| { + x.iter() + .map(|id| match *id { + Some(i) => zone_id[&node_zone[i]], + None => no_zone, + }) + .collect() + }) + .collect(); + + //We minimize the distance to the former zone assignation + zone_assignation = + optimize_matching(&old_zone_assignation, &zone_assignation, nb_zones + 1); //+1 for no_zone + + //We need to assign partitions to nodes in their zone + //We first put the nodes assignation that can stay the same + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if let Some(Some(former_node)) = old_node_assignation[i].iter().find(|x| { + if let Some(id) = x { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + } else { + false + } + }) { + if part_per_nod[*former_node] > 0 { + node_assignation[i][j] = Some(*former_node); + part_per_nod[*former_node] -= 1; + } + } + } + } -} + //We complete the assignation of partitions to nodes + let mut rng = rand::thread_rng(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if node_assignation[i][j] == None { + let possible_nodes: Vec = (0..nb_nodes) + .filter(|id| { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + && part_per_nod[*id] > 0 + }) + .collect(); + assert!(possible_nodes.len() > 0); + //We randomly pick a node + if let Some(nod) = possible_nodes.choose(&mut rng) { + node_assignation[i][j] = Some(*nod); + part_per_nod[*nod] -= 1; + } + } + } + } + + //We write the assignation in the 1D table + self.ring_assignation_data = Vec::::new(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if let Some(id) = node_assignation[i][j] { + self.ring_assignation_data.push(id as CompactNodeType); + } else { + assert!(false) + } + } + } + true + } else { + false + } + } + + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignation given by ring, with the new indices of the nodes, and + /// None of the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignation + /// do modify assignation_ring and node_id_vec. + fn update_nodes_and_ring(&mut self) -> Vec>> { + let nb_partitions = 1usize << PARTITION_BITS; + let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; + let rf = self.replication_factor; + let ring = &self.ring_assignation_data; + + let new_node_id_vec: Vec = self.roles.items().iter().map(|(k, _, _)| *k).collect(); + + if ring.len() == rf * nb_partitions { + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + node_assignation[i][j] = new_node_id_vec + .iter() + .position(|id| *id == self.node_id_vec[ring[i * rf + j] as usize]); + } + } + } + + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = vec![]; + return node_assignation; + } + + ///This function compute the number of partition to assign to + ///every node and zone, so that every partition is replicated + ///self.replication_factor times and the capacity of a partition + ///is maximized. + fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { + let mut zone_capacity: HashMap = HashMap::new(); + + let (node_zone, node_capacity) = self.get_node_zone_capacity(); + let nb_nodes = self.node_id_vec.len(); + + for i in 0..nb_nodes { + if zone_capacity.contains_key(&node_zone[i]) { + zone_capacity.insert( + node_zone[i].clone(), + zone_capacity[&node_zone[i]] + node_capacity[i], + ); + } else { + zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); + } + } + + //Compute the optimal number of partitions per zone + let sum_capacities: u32 = zone_capacity.values().sum(); + + if sum_capacities <= 0 { + println!("No storage capacity in the network."); + return None; + } + + let nb_partitions = 1 << PARTITION_BITS; + + //Initially we would like to use zones porportionally to + //their capacity. + //However, a large zone can be associated to at most + //nb_partitions to ensure replication of the date. + //So we take the min with nb_partitions: + let mut part_per_zone: HashMap = zone_capacity + .iter() + .map(|(k, v)| { + ( + k.clone(), + min( + nb_partitions, + (self.replication_factor * nb_partitions * *v as usize) + / sum_capacities as usize, + ), + ) + }) + .collect(); + + //The replication_factor-1 upper bounds the number of + //part_per_zones that are greater than nb_partitions + for _ in 1..self.replication_factor { + //The number of partitions that are not assignated to + //a zone that takes nb_partitions. + let sum_capleft: u32 = zone_capacity + .keys() + .filter(|k| part_per_zone[*k] < nb_partitions) + .map(|k| zone_capacity[k]) + .sum(); + + //The number of replication of the data that we need + //to ensure. + let repl_left = self.replication_factor + - part_per_zone + .values() + .filter(|x| **x == nb_partitions) + .count(); + if repl_left == 0 { + break; + } + + for k in zone_capacity.keys() { + if part_per_zone[k] != nb_partitions { + part_per_zone.insert( + k.to_string(), + min( + nb_partitions, + (nb_partitions * zone_capacity[k] as usize * repl_left) + / sum_capleft as usize, + ), + ); + } + } + } + + //Now we divide the zone's partition share proportionally + //between their nodes. + + let mut part_per_nod: Vec = (0..nb_nodes) + .map(|i| { + (part_per_zone[&node_zone[i]] * node_capacity[i] as usize) + / zone_capacity[&node_zone[i]] as usize + }) + .collect(); + + //We must update the part_per_zone to make it correspond to + //part_per_nod (because of integer rounding) + part_per_zone = part_per_zone.iter().map(|(k, _)| (k.clone(), 0)).collect(); + for i in 0..nb_nodes { + part_per_zone.insert( + node_zone[i].clone(), + part_per_zone[&node_zone[i]] + part_per_nod[i], + ); + } + + //Because of integer rounding, the total sum of part_per_nod + //might not be replication_factor*nb_partitions. + // We need at most to add 1 to every non maximal value of + // part_per_nod. The capacity of a partition will be bounded + // by the minimal value of + // node_capacity_vec[i]/part_per_nod[i] + // so we try to maximize this minimal value, keeping the + // part_per_zone capped + + let discrepancy: usize = + nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); + + //We use a stupid O(N^2) algorithm. If the number of nodes + //is actually expected to be high, one should optimize this. + + for _ in 0..discrepancy { + if let Some(idmax) = (0..nb_nodes) + .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) + .max_by(|i, j| { + (node_capacity[*i] * (part_per_nod[*j] + 1) as u32) + .cmp(&(node_capacity[*j] * (part_per_nod[*i] + 1) as u32)) + }) { + part_per_nod[idmax] += 1; + part_per_zone.insert( + node_zone[idmax].clone(), + part_per_zone[&node_zone[idmax]] + 1, + ); + } + } + //We check the algorithm consistency + + let discrepancy: usize = + nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); + assert!(discrepancy == 0); + assert!(if let Some(v) = part_per_zone.values().max() { + *v <= nb_partitions + } else { + false + }); + + Some((part_per_nod, part_per_zone)) + } + + //Returns vectors of zone and capacity; indexed by the same (temporary) + //indices as node_id_vec. + fn get_node_zone_capacity(&self) -> (Vec, Vec) { + let node_zone = self + .node_id_vec + .iter() + .map(|id_nod| match self.node_role(id_nod) { + Some(NodeRole { + zone, + capacity: _, + tags: _, + }) => zone.clone(), + _ => "".to_string(), + }) + .collect(); + + let node_capacity = self + .node_id_vec + .iter() + .map(|id_nod| match self.node_role(id_nod) { + Some(NodeRole { + zone: _, + capacity, + tags: _, + }) => { + if let Some(c) = capacity { + *c + } else { + 0 + } + } + _ => 0, + }) + .collect(); + + (node_zone, node_capacity) + } +} #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; - - fn check_assignation(cl : &ClusterLayout) { - - //Check that input data has the right format - let nb_partitions = 1usize<>(); - - let zone_vec = node_zone.iter().unique().collect::>(); - let zone_nb_part = zone_vec.iter().map( |z| cl.ring_assignation_data.iter() - .filter(|x| node_zone[**x as usize] == **z) - .count() - ).collect::>(); - - //Check optimality of the zone assignation : would it be better for the - //node_capacity/node_partitions ratio to change the assignation of a partition - - if let Some(idmin) = (0..nb_nodes).min_by( - |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) - ){ - if let Some(idnew) = (0..nb_nodes) - .filter( |i| if let Some(p) = zone_vec.iter().position(|z| **z==node_zone[*i]) - {zone_nb_part[p] < nb_partitions } - else { false }) - .max_by( - |i,j| (node_capacity[*i]*(node_nb_part[*j]as u32+1)) - .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) - ){ - assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= - node_capacity[idnew]*node_nb_part[idmin] as u32); - } - - } - - //In every zone, check optimality of the nod assignation - for z in zone_vec { - let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z ); - if let Some(idmin) = node_of_z_iter.clone().min_by( - |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) - ){ - if let Some(idnew) = node_of_z_iter.min_by( - |i,j| (node_capacity[*i]*(node_nb_part[*j] as u32+1)) - .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) - ){ - assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= - node_capacity[idnew]*node_nb_part[idmin] as u32); - } - } - } - - } - - fn update_layout(cl : &mut ClusterLayout, node_id_vec : &Vec, - node_capacity_vec : &Vec , node_zone_vec : &Vec) { - for i in 0..node_id_vec.len(){ - if let Some(x) = FixedBytes32::try_from(&[i as u8;32]) { - cl.node_id_vec.push(x); - } - - let update = cl.roles.update_mutator(cl.node_id_vec[i] , - NodeRoleV(Some(NodeRole{ - zone : (node_zone_vec[i].to_string()), - capacity : (Some(node_capacity_vec[i])), - tags : (vec![])}))); - cl.roles.merge(&update); - } - } - - #[test] - fn test_assignation() { - - let mut node_id_vec = vec![1,2,3]; - let mut node_capacity_vec = vec![4000,1000,2000]; - let mut node_zone_vec= vec!["A", "B", "C"].into_iter().map(|x| x.to_string()).collect(); - - let mut cl = ClusterLayout { - node_id_vec: vec![], - - roles : LwwMap::new(), - - replication_factor: 3, - ring_assignation_data : vec![], - version:0, - staging: LwwMap::new(), - staging_hash: sha256sum(&[1;32]), - }; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - node_id_vec = vec![1,2,3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000,1000,1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec= vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"].into_iter().map(|x| x.to_string()).collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - node_capacity_vec = vec![4000,1000,2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - - node_capacity_vec = vec![4000,4000,2000, 7000, 1000, 9000, 2000, 10, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - } -} + use super::*; + use itertools::Itertools; + + fn check_assignation(cl: &ClusterLayout) { + //Check that input data has the right format + let nb_partitions = 1usize << PARTITION_BITS; + assert!([1, 2, 3].contains(&cl.replication_factor)); + assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); + + let (node_zone, node_capacity) = cl.get_node_zone_capacity(); + + //Check that is is a correct assignation with zone redundancy + let rf = cl.replication_factor; + for i in 0..nb_partitions { + assert!( + rf == cl.ring_assignation_data[rf * i..rf * (i + 1)] + .iter() + .map(|nod| node_zone[*nod as usize].clone()) + .unique() + .count() + ); + } + + let nb_nodes = cl.node_id_vec.len(); + //Check optimality + let node_nb_part = (0..nb_nodes) + .map(|i| { + cl.ring_assignation_data + .iter() + .filter(|x| **x == i as u8) + .count() + }) + .collect::>(); + + let zone_vec = node_zone.iter().unique().collect::>(); + let zone_nb_part = zone_vec + .iter() + .map(|z| { + cl.ring_assignation_data + .iter() + .filter(|x| node_zone[**x as usize] == **z) + .count() + }) + .collect::>(); + + //Check optimality of the zone assignation : would it be better for the + //node_capacity/node_partitions ratio to change the assignation of a partition + + if let Some(idmin) = (0..nb_nodes).min_by(|i, j| { + (node_capacity[*i] * node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) + }) { + if let Some(idnew) = (0..nb_nodes) + .filter(|i| { + if let Some(p) = zone_vec.iter().position(|z| **z == node_zone[*i]) { + zone_nb_part[p] < nb_partitions + } else { + false + } + }) + .max_by(|i, j| { + (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) + .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) + }) { + assert!( + node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) + >= node_capacity[idnew] * node_nb_part[idmin] as u32 + ); + } + } + + //In every zone, check optimality of the nod assignation + for z in zone_vec { + let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z); + if let Some(idmin) = node_of_z_iter.clone().min_by(|i, j| { + (node_capacity[*i] * node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) + }) { + if let Some(idnew) = node_of_z_iter.min_by(|i, j| { + (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) + .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) + }) { + assert!( + node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) + >= node_capacity[idnew] * node_nb_part[idmin] as u32 + ); + } + } + } + } + + fn update_layout( + cl: &mut ClusterLayout, + node_id_vec: &Vec, + node_capacity_vec: &Vec, + node_zone_vec: &Vec, + ) { + for i in 0..node_id_vec.len() { + if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { + cl.node_id_vec.push(x); + } + let update = cl.roles.update_mutator( + cl.node_id_vec[i], + NodeRoleV(Some(NodeRole { + zone: (node_zone_vec[i].to_string()), + capacity: (Some(node_capacity_vec[i])), + tags: (vec![]), + })), + ); + cl.roles.merge(&update); + } + } + + #[test] + fn test_assignation() { + let mut node_id_vec = vec![1, 2, 3]; + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let mut cl = ClusterLayout { + node_id_vec: vec![], + roles: LwwMap::new(), + replication_factor: 3, + ring_assignation_data: vec![], + version: 0, + staging: LwwMap::new(), + staging_hash: sha256sum(&[1; 32]), + }; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000, 4000, 2000, 7000, 1000, 9000, 2000, 10, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + } +} -- cgit v1.2.3 From 948ff93cf10da1705766c2f0d256c316adcb806b Mon Sep 17 00:00:00 2001 From: Mendes Date: Sun, 1 May 2022 16:05:39 +0200 Subject: Corrected the warnings and errors issued by cargo clippy --- src/rpc/layout.rs | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index ac31da72..d0ee3463 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -195,8 +195,8 @@ impl ClusterLayout { .collect::>(); //We create an indexing of the zones let mut zone_id = HashMap::::new(); - for i in 0..part_per_zone_vec.len() { - zone_id.insert(part_per_zone_vec[i].0.clone(), i); + for (i, ppz) in part_per_zone_vec.iter().enumerate() { + zone_id.insert(ppz.0.clone(), i); } //We compute a candidate for the new partition to zone @@ -212,7 +212,7 @@ impl ClusterLayout { let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; //We will decrement part_per_nod to keep track of the number //of partitions that we still have to associate. - let mut part_per_nod = part_per_nod.clone(); + let mut part_per_nod = part_per_nod; //We minimize the distance to the former assignation(if any) @@ -265,7 +265,7 @@ impl ClusterLayout { && part_per_nod[*id] > 0 }) .collect(); - assert!(possible_nodes.len() > 0); + assert!(!possible_nodes.is_empty()); //We randomly pick a node if let Some(nod) = possible_nodes.choose(&mut rng) { node_assignation[i][j] = Some(*nod); @@ -277,12 +277,12 @@ impl ClusterLayout { //We write the assignation in the 1D table self.ring_assignation_data = Vec::::new(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if let Some(id) = node_assignation[i][j] { + for ass in node_assignation { + for nod in ass { + if let Some(id) = nod { self.ring_assignation_data.push(id as CompactNodeType); } else { - assert!(false) + panic!() } } } @@ -318,7 +318,7 @@ impl ClusterLayout { self.node_id_vec = new_node_id_vec; self.ring_assignation_data = vec![]; - return node_assignation; + node_assignation } ///This function compute the number of partition to assign to @@ -345,7 +345,7 @@ impl ClusterLayout { //Compute the optimal number of partitions per zone let sum_capacities: u32 = zone_capacity.values().sum(); - if sum_capacities <= 0 { + if sum_capacities == 0 { println!("No storage capacity in the network."); return None; } @@ -493,14 +493,10 @@ impl ClusterLayout { .map(|id_nod| match self.node_role(id_nod) { Some(NodeRole { zone: _, - capacity, + capacity: Some(c), tags: _, }) => { - if let Some(c) = capacity { *c - } else { - 0 - } } _ => 0, }) -- cgit v1.2.3 From 617f28bfa466d52fac7244f08b3a036ab4e8c9af Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 5 May 2022 14:21:57 +0200 Subject: Correct small formatting issue --- src/rpc/layout.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index d0ee3463..40f97368 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -495,9 +495,7 @@ impl ClusterLayout { zone: _, capacity: Some(c), tags: _, - }) => { - *c - } + }) => *c, _ => 0, }) .collect(); -- cgit v1.2.3 From 7f3249a23770fd4da981c2ecb1126da97e9b4ca5 Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 21 Sep 2022 14:39:59 +0200 Subject: New version of the algorithm that calculate the layout. It takes as paramters the replication factor and the zone redundancy, computes the largest partition size reachable with these constraints, and among the possible assignation with this partition size, it computes the one that moves the least number of partitions compared to the previous assignation. This computation uses graph algorithms defined in graph_algo.rs --- src/rpc/layout.rs | 795 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 471 insertions(+), 324 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 40f97368..ff60ce98 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,17 +1,23 @@ -use std::cmp::min; use std::cmp::Ordering; use std::collections::HashMap; +use std::collections::HashSet; + +use hex::ToHex; use serde::{Deserialize, Serialize}; -use garage_util::bipartite::*; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; -use rand::prelude::SliceRandom; +use crate::graph_algo::*; use crate::ring::*; +use std::convert::TryInto; + +//The Message type will be used to collect information on the algorithm. +type Message = Vec; + /// The layout of the cluster, i.e. the list of roles /// which are assigned to each cluster node #[derive(Clone, Debug, Serialize, Deserialize)] @@ -19,12 +25,21 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, + #[serde(default="default_one")] + pub zone_redundancy: usize, + + //This attribute is only used to retain the previously computed partition size, + //to know to what extent does it change with the layout update. + #[serde(default="default_zero")] + pub partition_size: u32, + pub roles: LwwMap, /// node_id_vec: a vector of node IDs with a role assigned /// in the system (this includes gateway nodes). /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers + /// 1. non-gateway nodes are first so that they have lower numbers holding + /// in u8 (the number of non-gateway nodes is at most 256). /// 2. nodes that don't have a role are excluded (but they need to /// stay in the CRDT as tombstones) pub node_id_vec: Vec, @@ -38,6 +53,15 @@ pub struct ClusterLayout { pub staging_hash: Hash, } +fn default_one() -> usize{ + return 1; +} +fn default_zero() -> u32{ + return 0; +} + +const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -66,16 +90,31 @@ impl NodeRole { None => "gateway".to_string(), } } + + pub fn tags_string(&self) -> String { + let mut tags = String::new(); + if self.tags.len() == 0 { + return tags + } + tags.push_str(&self.tags[0].clone()); + for t in 1..self.tags.len(){ + tags.push_str(","); + tags.push_str(&self.tags[t].clone()); + } + return tags; + } } impl ClusterLayout { - pub fn new(replication_factor: usize) -> Self { + pub fn new(replication_factor: usize, zone_redundancy: usize) -> Self { let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); ClusterLayout { version: 0, replication_factor, + zone_redundancy, + partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), @@ -122,6 +161,44 @@ impl ClusterLayout { } } + ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub fn useful_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity != None => result.push(*uuid), + _ => () + } + } + return result; + } + + ///Given a node uuids, this function returns the label of its zone + pub fn get_node_zone(&self, uuid : &Uuid) -> Result { + match self.node_role(uuid) { + Some(role) => return Ok(role.zone.clone()), + _ => return Err("The Uuid does not correspond to a node present in the cluster.".to_string()) + } + } + + ///Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), + _ => return Err("The Uuid does not correspond to a node present in the cluster or this node does not have a positive capacity.".to_string()) + } + } + + ///Returns the sum of capacities of non gateway nodes in the cluster + pub fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.useful_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + return Ok(total_capacity); + } + + /// Check a cluster layout for internal consistency /// returns true if consistent, false if error pub fn check(&self) -> bool { @@ -168,342 +245,412 @@ impl ClusterLayout { true } +} + +impl ClusterLayout { /// This function calculates a new partition-to-node assignation. - /// The computed assignation maximizes the capacity of a + /// The computed assignation respects the node replication factor + /// and the zone redundancy parameter It maximizes the capacity of a /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of - /// data to be moved. A heuristic ensures node triplets - /// dispersion (in garage_util::bipartite::optimize_matching()). - pub fn calculate_partition_assignation(&mut self) -> bool { + /// data to be moved. + pub fn calculate_partition_assignation(&mut self, replication:usize, redundancy:usize) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. - let old_node_assignation = self.update_nodes_and_ring(); - - let (node_zone, _) = self.get_node_zone_capacity(); - - //We compute the optimal number of partition to assign to - //every node and zone. - if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions() { - //We collect part_per_zone in a vec to not rely on the - //arbitrary order in which elements are iterated in - //Hashmap::iter() - let part_per_zone_vec = part_per_zone - .iter() - .map(|(x, y)| (x.clone(), *y)) - .collect::>(); - //We create an indexing of the zones - let mut zone_id = HashMap::::new(); - for (i, ppz) in part_per_zone_vec.iter().enumerate() { - zone_id.insert(ppz.0.clone(), i); - } - - //We compute a candidate for the new partition to zone - //assignation. - let nb_zones = part_per_zone.len(); - let nb_nodes = part_per_nod.len(); - let nb_partitions = 1 << PARTITION_BITS; - let left_cap_vec = vec![self.replication_factor as u32; nb_partitions]; - let right_cap_vec = part_per_zone_vec.iter().map(|(_, y)| *y as u32).collect(); - let mut zone_assignation = dinic_compute_matching(left_cap_vec, right_cap_vec); - - //We create the structure for the partition-to-node assignation. - let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; - //We will decrement part_per_nod to keep track of the number - //of partitions that we still have to associate. - let mut part_per_nod = part_per_nod; - - //We minimize the distance to the former assignation(if any) - - //We get the id of the zones of the former assignation - //(and the id no_zone if there is no node assignated) - let no_zone = part_per_zone_vec.len(); - let old_zone_assignation: Vec> = old_node_assignation - .iter() - .map(|x| { - x.iter() - .map(|id| match *id { - Some(i) => zone_id[&node_zone[i]], - None => no_zone, - }) - .collect() - }) - .collect(); - - //We minimize the distance to the former zone assignation - zone_assignation = - optimize_matching(&old_zone_assignation, &zone_assignation, nb_zones + 1); //+1 for no_zone - - //We need to assign partitions to nodes in their zone - //We first put the nodes assignation that can stay the same - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if let Some(Some(former_node)) = old_node_assignation[i].iter().find(|x| { - if let Some(id) = x { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - } else { - false - } - }) { - if part_per_nod[*former_node] > 0 { - node_assignation[i][j] = Some(*former_node); - part_per_nod[*former_node] -= 1; - } - } - } - } - - //We complete the assignation of partitions to nodes - let mut rng = rand::thread_rng(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if node_assignation[i][j] == None { - let possible_nodes: Vec = (0..nb_nodes) - .filter(|id| { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - && part_per_nod[*id] > 0 - }) - .collect(); - assert!(!possible_nodes.is_empty()); - //We randomly pick a node - if let Some(nod) = possible_nodes.choose(&mut rng) { - node_assignation[i][j] = Some(*nod); - part_per_nod[*nod] -= 1; - } - } - } - } - - //We write the assignation in the 1D table - self.ring_assignation_data = Vec::::new(); - for ass in node_assignation { - for nod in ass { - if let Some(id) = nod { - self.ring_assignation_data.push(id as CompactNodeType); - } else { - panic!() - } - } - } - - true - } else { - false - } - } + + //We update the node ids, since the node list might have changed with the staged + //changes in the layout. We retrieve the old_assignation reframed with the new ids + let old_assignation_opt = self.update_node_id_vec()?; + self.replication_factor = replication; + self.zone_redundancy = redundancy; + + let mut msg = Message::new(); + msg.push(format!("Computation of a new cluster layout where partitions are + replicated {} times on at least {} distinct zones.", replication, redundancy)); + + //We generate for once numerical ids for the zone, to use them as indices in the + //flow graphs. + let (id_to_zone , zone_to_id) = self.generate_zone_ids()?; + + msg.push(format!("The cluster contains {} nodes spread over {} zones.", + self.useful_nodes().len(), id_to_zone.len())); + + //We compute the optimal partition size + let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + if old_assignation_opt != None { + msg.push(format!("Given the replication and redundancy constraint, the + optimal size of a partition is {}. In the previous layout, it used to + be {}.", partition_size, self.partition_size)); + } + else { + msg.push(format!("Given the replication and redundancy constraints, the + optimal size of a partition is {}.", partition_size)); + } + self.partition_size = partition_size; + + //We compute a first flow/assignment that is heuristically close to the previous + //assignment + let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; + + if let Some(assoc) = &old_assignation_opt { + //We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; + } + + msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); + + //We update the layout structure + self.update_ring_from_flow(id_to_zone.len() , &gflow)?; + return Ok(msg); + } /// The LwwMap of node roles might have changed. This function updates the node_id_vec /// and returns the assignation given by ring, with the new indices of the nodes, and - /// None of the node is not present anymore. + /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. - fn update_nodes_and_ring(&mut self) -> Vec>> { - let nb_partitions = 1usize << PARTITION_BITS; - let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; - let rf = self.replication_factor; - let ring = &self.ring_assignation_data; - - let new_node_id_vec: Vec = self.roles.items().iter().map(|(k, _, _)| *k).collect(); - - if ring.len() == rf * nb_partitions { - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - node_assignation[i][j] = new_node_id_vec - .iter() - .position(|id| *id == self.node_id_vec[ring[i * rf + j] as usize]); - } - } - } - - self.node_id_vec = new_node_id_vec; - self.ring_assignation_data = vec![]; - node_assignation + fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,String> { + // (1) We compute the new node list + //Non gateway nodes should be coded on 8bits, hence they must be first in the list + //We build the new node ids + let mut new_non_gateway_nodes: Vec = self.roles.items().iter() + .filter(|(_, _, v)| + match &v.0 {Some(r) if r.capacity != None => true, _=> false }) + .map(|(k, _, _)| *k).collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(format!("There are more than {} non-gateway nodes in the new layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + } + + let mut new_gateway_nodes: Vec = self.roles.items().iter() + .filter(|(_, _, v)| + match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) + .map(|(k, _, _)| *k).collect(); + + let nb_useful_nodes = new_non_gateway_nodes.len(); + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.append(&mut new_non_gateway_nodes); + new_node_id_vec.append(&mut new_gateway_nodes); + + + // (2) We retrieve the old association + //We rewrite the old association with the new indices. We only consider partition + //to node assignations where the node is still in use. + let nb_partitions = 1usize << PARTITION_BITS; + let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; + + if self.ring_assignation_data.len() == 0 { + //This is a new association + return Ok(None); + } + if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + return Err("The old assignation does not have a size corresponding to the old replication factor or the number of partitions.".to_string()); + } + + //We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + //We add the indices of only the new non-gateway nodes that can be used in the + //association ring + for i in 0..nb_useful_nodes { + uuid_to_new_id.insert(new_node_id_vec[i], i ); + } + + let rf= self.replication_factor; + for p in 0..nb_partitions { + for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { + let uuid = self.node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assignation[p].push(uuid_to_new_id[&uuid]); + } + } + } + + //We write the results + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = Vec::::new(); + + return Ok(Some(old_assignation)); } - ///This function compute the number of partition to assign to - ///every node and zone, so that every partition is replicated - ///self.replication_factor times and the capacity of a partition - ///is maximized. - fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { - let mut zone_capacity: HashMap = HashMap::new(); - - let (node_zone, node_capacity) = self.get_node_zone_capacity(); - let nb_nodes = self.node_id_vec.len(); - - for i in 0..nb_nodes { - if zone_capacity.contains_key(&node_zone[i]) { - zone_capacity.insert( - node_zone[i].clone(), - zone_capacity[&node_zone[i]] + node_capacity[i], - ); - } else { - zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); - } - } - - //Compute the optimal number of partitions per zone - let sum_capacities: u32 = zone_capacity.values().sum(); - - if sum_capacities == 0 { - println!("No storage capacity in the network."); - return None; - } - - let nb_partitions = 1 << PARTITION_BITS; - //Initially we would like to use zones porportionally to - //their capacity. - //However, a large zone can be associated to at most - //nb_partitions to ensure replication of the date. - //So we take the min with nb_partitions: - let mut part_per_zone: HashMap = zone_capacity - .iter() - .map(|(k, v)| { - ( - k.clone(), - min( - nb_partitions, - (self.replication_factor * nb_partitions * *v as usize) - / sum_capacities as usize, - ), - ) - }) - .collect(); - - //The replication_factor-1 upper bounds the number of - //part_per_zones that are greater than nb_partitions - for _ in 1..self.replication_factor { - //The number of partitions that are not assignated to - //a zone that takes nb_partitions. - let sum_capleft: u32 = zone_capacity - .keys() - .filter(|k| part_per_zone[*k] < nb_partitions) - .map(|k| zone_capacity[k]) - .sum(); - - //The number of replication of the data that we need - //to ensure. - let repl_left = self.replication_factor - - part_per_zone - .values() - .filter(|x| **x == nb_partitions) - .count(); - if repl_left == 0 { - break; - } - - for k in zone_capacity.keys() { - if part_per_zone[k] != nb_partitions { - part_per_zone.insert( - k.to_string(), - min( - nb_partitions, - (nb_partitions * zone_capacity[k] as usize * repl_left) - / sum_capleft as usize, - ), - ); - } - } - } - - //Now we divide the zone's partition share proportionally - //between their nodes. - - let mut part_per_nod: Vec = (0..nb_nodes) - .map(|i| { - (part_per_zone[&node_zone[i]] * node_capacity[i] as usize) - / zone_capacity[&node_zone[i]] as usize - }) - .collect(); - - //We must update the part_per_zone to make it correspond to - //part_per_nod (because of integer rounding) - part_per_zone = part_per_zone.iter().map(|(k, _)| (k.clone(), 0)).collect(); - for i in 0..nb_nodes { - part_per_zone.insert( - node_zone[i].clone(), - part_per_zone[&node_zone[i]] + part_per_nod[i], - ); - } - - //Because of integer rounding, the total sum of part_per_nod - //might not be replication_factor*nb_partitions. - // We need at most to add 1 to every non maximal value of - // part_per_nod. The capacity of a partition will be bounded - // by the minimal value of - // node_capacity_vec[i]/part_per_nod[i] - // so we try to maximize this minimal value, keeping the - // part_per_zone capped - - let discrepancy: usize = - nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); - - //We use a stupid O(N^2) algorithm. If the number of nodes - //is actually expected to be high, one should optimize this. - - for _ in 0..discrepancy { - if let Some(idmax) = (0..nb_nodes) - .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) - .max_by(|i, j| { - (node_capacity[*i] * (part_per_nod[*j] + 1) as u32) - .cmp(&(node_capacity[*j] * (part_per_nod[*i] + 1) as u32)) - }) { - part_per_nod[idmax] += 1; - part_per_zone.insert( - node_zone[idmax].clone(), - part_per_zone[&node_zone[idmax]] + 1, - ); - } - } - - //We check the algorithm consistency - - let discrepancy: usize = - nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); - assert!(discrepancy == 0); - assert!(if let Some(v) = part_per_zone.values().max() { - *v <= nb_partitions - } else { - false - }); - - Some((part_per_nod, part_per_zone)) - } - - //Returns vectors of zone and capacity; indexed by the same (temporary) - //indices as node_id_vec. - fn get_node_zone_capacity(&self) -> (Vec, Vec) { - let node_zone = self - .node_id_vec - .iter() - .map(|id_nod| match self.node_role(id_nod) { - Some(NodeRole { - zone, - capacity: _, - tags: _, - }) => zone.clone(), - _ => "".to_string(), - }) - .collect(); - - let node_capacity = self - .node_id_vec - .iter() - .map(|id_nod| match self.node_role(id_nod) { - Some(NodeRole { - zone: _, - capacity: Some(c), - tags: _, - }) => *c, - _ => 0, - }) - .collect(); - - (node_zone, node_capacity) - } + ///This function generates ids for the zone of the nodes appearing in + ///self.node_id_vec. + fn generate_zone_ids(&self) -> Result<(Vec, HashMap),String>{ + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.node_id_vec.iter() { + if self.roles.get(uuid) == None { + return Err("The uuid was not found in the node roles (this should not happen, it might be a critical error).".to_string()); + } + match self.node_role(&uuid) { + Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone() , id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + _ => () + } + } + return Ok((id_to_zone, zone_to_id)); + } + + ///This function computes by dichotomy the largest realizable partition size, given + ///the layout. + fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ + let nb_partitions = 1usize << PARTITION_BITS; + let empty_set = HashSet::<(usize,usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { + return Err("The storage capacity of he cluster is to small. It is impossible to store partitions of size 1.".to_string()); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down +1 < s_up { + g = self.generate_flow_graph((s_down+s_up)/2, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { + s_up = (s_down+s_up)/2; + } + else { + s_down = (s_down+s_up)/2; + } + } + + return Ok(s_down); + } + + fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + return vertices; + } + + fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, String> { + let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), + self.useful_nodes().len()); + let mut g= Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), self.zone_redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - self.zone_redundancy) as u32)?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; + g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , + self.replication_factor as u32)?; + } + } + for n in 0..self.useful_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity/size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p,n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + return Ok(g); + } + + + fn compute_candidate_assignment(&self, zone_to_id: &HashMap, + old_assoc_opt : &Option >>) -> Result, String > { + + //We list the edges that are not used in the old association + let mut exclude_edge = HashSet::<(usize,usize)>::new(); + if let Some(old_assoc) = old_assoc_opt { + let nb_nodes = self.useful_nodes().len(); + for p in 0..NB_PARTITIONS { + for n in 0..nb_nodes { + exclude_edge.insert((p,n)); + } + for n in old_assoc[p].iter() { + exclude_edge.remove(&(p,*n)); + } + } + } + + //We compute the best flow using only the edges used in the old assoc + let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge )?; + g.compute_maximal_flow()?; + for (p,n) in exclude_edge.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + return Ok(g); + } + + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), String > { + let mut cost = CostFunction::new(); + for p in 0..NB_PARTITIONS { + for n in old_assoc[p].iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); + } + } + let nb_nodes = self.useful_nodes().len(); + let path_length = 4*nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + return Ok(()); + } + + fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), String>{ + self.ring_assignation_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; + for vertex in assoc_vertex.iter() { + match vertex{ + Vertex::N(n) => self.ring_assignation_data.push((*n).try_into().unwrap()), + _ => () + } + } + } + } + + if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { + return Err("Critical Error : the association ring we produced does not have the right size.".to_string()); + } + return Ok(()); + } + + + //This function returns a message summing up the partition repartition of the new + //layout. + fn output_stat(&self , gflow : &Graph, + old_assoc_opt : &Option< Vec> >, + zone_to_id: &HashMap, + id_to_zone : &Vec) -> Result{ + let mut msg = Message::new(); + + let nb_partitions = 1usize << PARTITION_BITS; + let used_cap = self.partition_size * nb_partitions as u32 * + self.replication_factor as u32; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); + msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + used_cap , total_cap , percent_cap )); + msg.push(format!("If the percentage is to low, it might be that the replication/redundancy constraints force the use of nodes/zones with small storage capacities. + You might want to rebalance the storage capacities or relax the constraints. See the detailed statistics below and look for saturated nodes/zones.")); + msg.push(format!("Recall that because of the replication, the actual available storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); + + //We define and fill in the following tables + let storing_nodes = self.useful_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..nb_partitions { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; + if pz_nodes.len() > 0 { + stored_partitions_zone[z] += 1; + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(old_assoc) = old_assoc_opt { + if !old_assoc[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p.push( + zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + } + + //We display the statistics + + if *old_assoc_opt != None { + let total_new_partitions : usize = new_partitions.iter().sum(); + msg.push(format!("A total of {} new copies of partitions need to be \ + transferred.", total_new_partitions)); + } + msg.push(format!("")); + msg.push(format!("Detailed statistics by zones and nodes.")); + + for z in 0..id_to_zone.len(){ + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len(){ + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions : usize = nodes_of_z.iter() + .map(|n| stored_partitions[*n]).sum(); + msg.push(format!("")); + + if *old_assoc_opt != None { + msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ + {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], + new_partitions_zone[z], replicated_partitions)); + } + else{ + msg.push(format!("Zone {}: {} distinct partitions stored ({} partition \ + copies) ", + id_to_zone[z], stored_partitions_zone[z], replicated_partitions)); + } + + let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); + msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", + available_cap_z, total_cap_z, percent_cap_z)); + msg.push(format!("")); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; + let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self.node_role(&self.node_id_vec[*n]) + .ok_or("Node not found."))?.tags_string(); + msg.push(format!(" Node {}: {} partitions ({} new) ; \ + available/total capacity: {} / {} ({:.1}%) ; tags:{}", + &self.node_id_vec[*n].to_vec().encode_hex::(), + stored_partitions[*n], + new_partitions[*n], available_cap_n, total_cap_n, + (available_cap_n as f32)/(total_cap_n as f32)*100.0 , + tags_n)); + } + } + + return Ok(msg); + } + } +//==================================================================================== + #[cfg(test)] mod tests { use super::*; -- cgit v1.2.3 From bd842e1388a324e2a3956465e9b32d0dc739a8d9 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 22 Sep 2022 19:30:01 +0200 Subject: Correction of a few bugs in the tests, modification of ClusterLayout::check --- src/rpc/layout.rs | 173 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 119 insertions(+), 54 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index ff60ce98..a878f19c 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::collections::HashSet; use hex::ToHex; +use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -185,7 +186,8 @@ impl ClusterLayout { pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err("The Uuid does not correspond to a node present in the cluster or this node does not have a positive capacity.".to_string()) + _ => return Err("The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity.".to_string()) } } @@ -242,6 +244,47 @@ impl ClusterLayout { } } + //Check that every partition is associated to distinct nodes + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignation_data[rf*p..rf*(p+1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return false; + } + //Check that every partition is spread over at least zone_redundancy zones. + let zones_of_p = nodes_of_p.iter() + .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.")); + if zones_of_p.unique().count() < self.zone_redundancy { + return false; + } + } + + //Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignation_data.iter() { + node_usage[*n as usize] += 1; + } + for n in 0..MAX_NODE_NUMBER { + if node_usage[n] > 0 { + let uuid = self.node_id_vec[n]; + if node_usage[n]*self.partition_size > self.get_node_capacity(&uuid) + .expect("Critical Error"){ + return false; + } + } + } + + //Check that the partition size stored is the one computed by the asignation + //algorithm. + let cl2 = self.clone(); + let (_ , zone_to_id) = cl2.generate_zone_ids().expect("Critical Error"); + let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); + if partition_size != self.partition_size { + return false; + } + + true } @@ -267,7 +310,7 @@ impl ClusterLayout { self.zone_redundancy = redundancy; let mut msg = Message::new(); - msg.push(format!("Computation of a new cluster layout where partitions are + msg.push(format!("Computation of a new cluster layout where partitions are \ replicated {} times on at least {} distinct zones.", replication, redundancy)); //We generate for once numerical ids for the zone, to use them as indices in the @@ -276,16 +319,19 @@ impl ClusterLayout { msg.push(format!("The cluster contains {} nodes spread over {} zones.", self.useful_nodes().len(), id_to_zone.len())); - + //We compute the optimal partition size + //Capacities should be given in a unit so that partition size is at least 100. + //In this case, integer rounding plays a marginal role in the percentages of + //optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; if old_assignation_opt != None { - msg.push(format!("Given the replication and redundancy constraint, the - optimal size of a partition is {}. In the previous layout, it used to + msg.push(format!("Given the replication and redundancy constraint, the \ + optimal size of a partition is {}. In the previous layout, it used to \ be {}.", partition_size, self.partition_size)); } else { - msg.push(format!("Given the replication and redundancy constraints, the + msg.push(format!("Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", partition_size)); } self.partition_size = partition_size; @@ -293,13 +339,13 @@ impl ClusterLayout { //We compute a first flow/assignment that is heuristically close to the previous //assignment let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; - if let Some(assoc) = &old_assignation_opt { //We minimize the distance to the previous assignment. self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; } msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); + msg.push("".to_string()); //We update the layout structure self.update_ring_from_flow(id_to_zone.len() , &gflow)?; @@ -321,7 +367,8 @@ impl ClusterLayout { .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(format!("There are more than {} non-gateway nodes in the new layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + return Err(format!("There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); } let mut new_gateway_nodes: Vec = self.roles.items().iter() @@ -346,7 +393,8 @@ impl ClusterLayout { return Ok(None); } if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err("The old assignation does not have a size corresponding to the old replication factor or the number of partitions.".to_string()); + return Err("The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions.".to_string()); } //We build a translation table between the uuid and new ids @@ -384,7 +432,8 @@ impl ClusterLayout { for uuid in self.node_id_vec.iter() { if self.roles.get(uuid) == None { - return Err("The uuid was not found in the node roles (this should not happen, it might be a critical error).".to_string()); + return Err("The uuid was not found in the node roles (this should \ + not happen, it might be a critical error).".to_string()); } match self.node_role(&uuid) { Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { @@ -405,7 +454,8 @@ impl ClusterLayout { let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err("The storage capacity of he cluster is to small. It is impossible to store partitions of size 1.".to_string()); + return Err("The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1.".to_string()); } let mut s_down = 1; @@ -525,11 +575,12 @@ impl ClusterLayout { } if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err("Critical Error : the association ring we produced does not have the right size.".to_string()); + return Err("Critical Error : the association ring we produced does not \ + have the right size.".to_string()); } return Ok(()); } - + //This function returns a message summing up the partition repartition of the new //layout. @@ -546,9 +597,16 @@ impl ClusterLayout { let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap , total_cap , percent_cap )); - msg.push(format!("If the percentage is to low, it might be that the replication/redundancy constraints force the use of nodes/zones with small storage capacities. - You might want to rebalance the storage capacities or relax the constraints. See the detailed statistics below and look for saturated nodes/zones.")); - msg.push(format!("Recall that because of the replication, the actual available storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); + msg.push(format!("")); + msg.push(format!("If the percentage is to low, it might be that the \ + replication/redundancy constraints force the use of nodes/zones with small \ + storage capacities. \ + You might want to rebalance the storage capacities or relax the constraints. \ + See the detailed statistics below and look for saturated nodes/zones.")); + msg.push(format!("Recall that because of the replication, the actual available \ + storage capacity is {} / {} = {}.", + used_cap , self.replication_factor , + used_cap/self.replication_factor as u32)); //We define and fill in the following tables let storing_nodes = self.useful_nodes(); @@ -563,6 +621,16 @@ impl ClusterLayout { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; if pz_nodes.len() > 0 { stored_partitions_zone[z] += 1; + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p.push( + zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } } for vert in pz_nodes.iter() { if let Vertex::N(n) = *vert { @@ -574,21 +642,17 @@ impl ClusterLayout { } } } - if let Some(old_assoc) = old_assoc_opt { - let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { - old_zones_of_p.push( - zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } } } + + if *old_assoc_opt == None { + new_partitions = stored_partitions.clone(); + new_partitions_zone = stored_partitions_zone.clone(); + } //We display the statistics + msg.push(format!("")); if *old_assoc_opt != None { let total_new_partitions : usize = new_partitions.iter().sum(); msg.push(format!("A total of {} new copies of partitions need to be \ @@ -608,16 +672,9 @@ impl ClusterLayout { .map(|n| stored_partitions[*n]).sum(); msg.push(format!("")); - if *old_assoc_opt != None { - msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ + msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], new_partitions_zone[z], replicated_partitions)); - } - else{ - msg.push(format!("Zone {}: {} distinct partitions stored ({} partition \ - copies) ", - id_to_zone[z], stored_partitions_zone[z], replicated_partitions)); - } let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; let mut total_cap_z = 0; @@ -625,18 +682,17 @@ impl ClusterLayout { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); - msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", + msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", available_cap_z, total_cap_z, percent_cap_z)); - msg.push(format!("")); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; let tags_n = (self.node_role(&self.node_id_vec[*n]) .ok_or("Node not found."))?.tags_string(); - msg.push(format!(" Node {}: {} partitions ({} new) ; \ + msg.push(format!(" Node {}: {} partitions ({} new) ; \ available/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec().encode_hex::(), + &self.node_id_vec[*n].to_vec()[0..2].to_vec().encode_hex::(), stored_partitions[*n], new_partitions[*n], available_cap_n, total_cap_n, (available_cap_n as f32)/(total_cap_n as f32)*100.0 , @@ -654,16 +710,14 @@ impl ClusterLayout { #[cfg(test)] mod tests { use super::*; - use itertools::Itertools; - + use std::io::*; +// use itertools::Itertools; +/* fn check_assignation(cl: &ClusterLayout) { //Check that input data has the right format let nb_partitions = 1usize << PARTITION_BITS; - assert!([1, 2, 3].contains(&cl.replication_factor)); assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); - let (node_zone, node_capacity) = cl.get_node_zone_capacity(); - //Check that is is a correct assignation with zone redundancy let rf = cl.replication_factor; for i in 0..nb_partitions { @@ -743,6 +797,13 @@ mod tests { } } } +*/ + + fn show_msg(msg : &Message) { + for s in msg.iter(){ + println!("{}",s); + } + } fn update_layout( cl: &mut ClusterLayout, @@ -769,7 +830,8 @@ mod tests { #[test] fn test_assignation() { - let mut node_id_vec = vec![1, 2, 3]; + std::io::stdout().flush().ok().expect("Could not flush stdout"); + let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] .into_iter() @@ -782,14 +844,16 @@ mod tests { roles: LwwMap::new(), replication_factor: 3, + zone_redundancy: 1, + partition_size: 0, ring_assignation_data: vec![], version: 0, staging: LwwMap::new(), - staging_hash: sha256sum(&[1; 32]), + staging_hash: blake2sum(&rmp_to_vec_all_named(&LwwMap::::new()).unwrap()[..]), }; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; @@ -798,17 +862,18 @@ mod tests { .map(|x| x.to_string()) .collect(); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); - node_capacity_vec = vec![4000, 4000, 2000, 7000, 1000, 9000, 2000, 10, 2000]; + node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,1).unwrap()); + assert!(cl.check()); + } } -- cgit v1.2.3 From ceac3713d6639f9170fc3b4475fae4a30b34483c Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 5 Oct 2022 15:29:48 +0200 Subject: modifications in several files to : - have consistent error return types - store the zone redundancy in a Lww - print the error and message in the CLI (TODO: for the server Api, should msg be returned in the body response?) --- src/rpc/layout.rs | 118 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 46 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 16d573c7..8d2b3e17 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; -use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; +use garage_util::crdt::{AutoCrdt, Crdt, LwwMap, Lww}; use garage_util::data::*; use garage_util::error::*; @@ -27,12 +27,10 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, - #[serde(default="default_one")] - pub zone_redundancy: usize, //This attribute is only used to retain the previously computed partition size, //to know to what extent does it change with the layout update. - #[serde(default="default_zero")] + #[serde(default="default_partition_size")] pub partition_size: u32, pub roles: LwwMap, @@ -51,17 +49,31 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout + #[serde(default="default_layout_parameters")] + pub parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } -fn default_one() -> usize{ - return 1; -} -fn default_zero() -> u32{ +fn default_partition_size() -> u32{ return 0; } +fn default_layout_parameters() -> Lww{ + Lww::::new(LayoutParameters{ zone_redundancy: 1}) +} + +///This struct is used to set the parameters to be used in the assignation computation +///algorithm. It is stored as a Crdt. +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct LayoutParameters { + pub zone_redundancy:usize, +} + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] @@ -108,18 +120,24 @@ impl NodeRole { } impl ClusterLayout { - pub fn new(replication_factor: usize, zone_redundancy: usize) -> Self { + pub fn new(replication_factor: usize) -> Self { + + //We set the default zone redundancy to be equal to the replication factor, + //i.e. as strict as possible. + let default_parameters = Lww::::new( + LayoutParameters{ zone_redundancy: replication_factor}); + let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); ClusterLayout { version: 0, replication_factor, - zone_redundancy, partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), + parameters: default_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -132,6 +150,7 @@ impl ClusterLayout { true } Ordering::Equal => { + self.parameters.merge(&other.parameters); self.staging.merge(&other.staging); let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); @@ -145,7 +164,7 @@ impl ClusterLayout { } } - pub fn apply_staged_changes(mut self, version: Option) -> Result { + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self,Message), Error> { match version { None => { let error = r#" @@ -164,16 +183,14 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging); self.roles.retain(|(_, _, v)| v.0.is_some()); - if !self.calculate_partition_assignation() { - return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); - } + let msg = self.calculate_partition_assignation()?; self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); self.version += 1; - Ok(self) + Ok((self,msg)) } pub fn revert_staged_changes(mut self, version: Option) -> Result { @@ -231,24 +248,24 @@ To know the correct value of the new layout version, invoke `garage layout show` } ///Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid : &Uuid) -> Result { + pub fn get_node_zone(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(role) => return Ok(role.zone.clone()), - _ => return Err("The Uuid does not correspond to a node present in the cluster.".to_string()) + _ => return Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) } } ///Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err("The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity.".to_string()) + _ => return Err(Error::Message("The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity.".into())) } } ///Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.useful_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -311,7 +328,8 @@ To know the correct value of the new layout version, invoke `garage layout show` let zones_of_p = nodes_of_p.iter() .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.")); - if zones_of_p.unique().count() < self.zone_redundancy { + let redundancy = self.parameters.get().zone_redundancy; + if zones_of_p.unique().count() < redundancy { return false; } } @@ -354,7 +372,7 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - pub fn calculate_partition_assignation(&mut self, replication:usize, redundancy:usize) -> Result { + pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. @@ -362,12 +380,12 @@ impl ClusterLayout { //We update the node ids, since the node list might have changed with the staged //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - self.replication_factor = replication; - self.zone_redundancy = redundancy; + let redundancy = self.parameters.get().zone_redundancy; + let mut msg = Message::new(); msg.push(format!("Computation of a new cluster layout where partitions are \ - replicated {} times on at least {} distinct zones.", replication, redundancy)); + replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); //We generate for once numerical ids for the zone, to use them as indices in the //flow graphs. @@ -381,6 +399,7 @@ impl ClusterLayout { //In this case, integer rounding plays a marginal role in the percentages of //optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + if old_assignation_opt != None { msg.push(format!("Given the replication and redundancy constraint, the \ optimal size of a partition is {}. In the previous layout, it used to \ @@ -392,6 +411,12 @@ impl ClusterLayout { } self.partition_size = partition_size; + if partition_size < 100 { + msg.push("WARNING: The partition size is low (< 100), you might consider to \ + give the nodes capacities in a smaller unit (e.g. Mb instead of Gb) to \ + achieve a more tailored use of your storage ressources.".into()); + } + //We compute a first flow/assignment that is heuristically close to the previous //assignment let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; @@ -413,7 +438,7 @@ impl ClusterLayout { /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,String> { + fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,Error> { // (1) We compute the new node list //Non gateway nodes should be coded on 8bits, hence they must be first in the list //We build the new node ids @@ -423,8 +448,8 @@ impl ClusterLayout { .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(format!("There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", MAX_NODE_NUMBER).into() )); } let mut new_gateway_nodes: Vec = self.roles.items().iter() @@ -449,8 +474,8 @@ impl ClusterLayout { return Ok(None); } if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err("The old assignation does not have a size corresponding to \ - the old replication factor or the number of partitions.".to_string()); + return Err(Error::Message("The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions.".into())); } //We build a translation table between the uuid and new ids @@ -482,14 +507,14 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. - fn generate_zone_ids(&self) -> Result<(Vec, HashMap),String>{ + fn generate_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); for uuid in self.node_id_vec.iter() { if self.roles.get(uuid) == None { - return Err("The uuid was not found in the node roles (this should \ - not happen, it might be a critical error).".to_string()); + return Err(Error::Message("The uuid was not found in the node roles (this should \ + not happen, it might be a critical error).".into())); } match self.node_role(&uuid) { Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { @@ -504,14 +529,14 @@ impl ClusterLayout { ///This function computes by dichotomy the largest realizable partition size, given ///the layout. - fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ + fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ let nb_partitions = 1usize << PARTITION_BITS; let empty_set = HashSet::<(usize,usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err("The storage capacity of he cluster is to small. It is \ - impossible to store partitions of size 1.".to_string()); + return Err(Error::Message("The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1.".into())); } let mut s_down = 1; @@ -545,14 +570,15 @@ impl ClusterLayout { return vertices; } - fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, String> { + fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); let mut g= Graph::::new(&vertices); let nb_zones = zone_to_id.len(); + let redundancy = self.parameters.get().zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), self.zone_redundancy as u32)?; - g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - self.zone_redundancy) as u32)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , @@ -574,7 +600,7 @@ impl ClusterLayout { fn compute_candidate_assignment(&self, zone_to_id: &HashMap, - old_assoc_opt : &Option >>) -> Result, String > { + old_assoc_opt : &Option >>) -> Result, Error > { //We list the edges that are not used in the old association let mut exclude_edge = HashSet::<(usize,usize)>::new(); @@ -601,7 +627,7 @@ impl ClusterLayout { return Ok(g); } - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), String > { + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), Error > { let mut cost = CostFunction::new(); for p in 0..NB_PARTITIONS { for n in old_assoc[p].iter() { @@ -616,7 +642,7 @@ impl ClusterLayout { return Ok(()); } - fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), String>{ + fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ self.ring_assignation_data = Vec::::new(); for p in 0..NB_PARTITIONS { for z in 0..nb_zones { @@ -631,8 +657,8 @@ impl ClusterLayout { } if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err("Critical Error : the association ring we produced does not \ - have the right size.".to_string()); + return Err(Error::Message("Critical Error : the association ring we produced does not \ + have the right size.".into())); } return Ok(()); } @@ -643,7 +669,7 @@ impl ClusterLayout { fn output_stat(&self , gflow : &Graph, old_assoc_opt : &Option< Vec> >, zone_to_id: &HashMap, - id_to_zone : &Vec) -> Result{ + id_to_zone : &Vec) -> Result{ let mut msg = Message::new(); let nb_partitions = 1usize << PARTITION_BITS; -- cgit v1.2.3 From 9407df60cc00fc70c10f73bc4b600085789d5353 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 6 Oct 2022 12:54:51 +0200 Subject: Corrected two bugs: - self.node_id_vec was not properly updated when the previous ring was empty - ClusterLayout::merge was not considering changes in the layout parameters --- src/rpc/layout.rs | 56 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 8d2b3e17..89c18c68 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -150,15 +150,17 @@ impl ClusterLayout { true } Ordering::Equal => { + let param_changed = self.parameters.get() != other.parameters.get(); self.parameters.merge(&other.parameters); self.staging.merge(&other.staging); + let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - let changed = new_staging_hash != self.staging_hash; + let stage_changed = new_staging_hash != self.staging_hash; self.staging_hash = new_staging_hash; - changed + stage_changed || param_changed } Ordering::Less => false, } @@ -352,7 +354,7 @@ To know the correct value of the new layout version, invoke `garage layout show` //Check that the partition size stored is the one computed by the asignation //algorithm. let cl2 = self.clone(); - let (_ , zone_to_id) = cl2.generate_zone_ids().expect("Critical Error"); + let (_ , zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); if partition_size != self.partition_size { return false; @@ -371,13 +373,14 @@ impl ClusterLayout { /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of - /// data to be moved. + /// data to be moved. + /// Staged changes must be merged with nodes roles before calling this function. pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. - - //We update the node ids, since the node list might have changed with the staged + + //We update the node ids, since the node role list might have changed with the //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; @@ -387,12 +390,23 @@ impl ClusterLayout { msg.push(format!("Computation of a new cluster layout where partitions are \ replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); - //We generate for once numerical ids for the zone, to use them as indices in the - //flow graphs. - let (id_to_zone , zone_to_id) = self.generate_zone_ids()?; + //We generate for once numerical ids for the zones of non gateway nodes, + //to use them as indices in the flow graphs. + let (id_to_zone , zone_to_id) = self.generate_useful_zone_ids()?; + let nb_useful_nodes = self.useful_nodes().len(); msg.push(format!("The cluster contains {} nodes spread over {} zones.", - self.useful_nodes().len(), id_to_zone.len())); + nb_useful_nodes, id_to_zone.len())); + if nb_useful_nodes < self.replication_factor{ + return Err(Error::Message(format!("The number of nodes with positive \ + capacity ({}) is smaller than the replication factor ({}).", + nb_useful_nodes, self.replication_factor))); + } + if id_to_zone.len() < redundancy { + return Err(Error::Message(format!("The number of zones with non-gateway \ + nodes ({}) is smaller than the redundancy parameter ({})", + id_to_zone.len() , redundancy))); + } //We compute the optimal partition size //Capacities should be given in a unit so that partition size is at least 100. @@ -413,8 +427,7 @@ impl ClusterLayout { if partition_size < 100 { msg.push("WARNING: The partition size is low (< 100), you might consider to \ - give the nodes capacities in a smaller unit (e.g. Mb instead of Gb) to \ - achieve a more tailored use of your storage ressources.".into()); + provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb).".into()); } //We compute a first flow/assignment that is heuristically close to the previous @@ -456,12 +469,14 @@ impl ClusterLayout { .filter(|(_, _, v)| match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) .map(|(k, _, _)| *k).collect(); - + let nb_useful_nodes = new_non_gateway_nodes.len(); let mut new_node_id_vec = Vec::::new(); new_node_id_vec.append(&mut new_non_gateway_nodes); new_node_id_vec.append(&mut new_gateway_nodes); + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); // (2) We retrieve the old association //We rewrite the old association with the new indices. We only consider partition @@ -490,15 +505,14 @@ impl ClusterLayout { let rf= self.replication_factor; for p in 0..nb_partitions { for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { - let uuid = self.node_id_vec[*old_id as usize]; + let uuid = old_node_id_vec[*old_id as usize]; if uuid_to_new_id.contains_key(&uuid) { old_assignation[p].push(uuid_to_new_id[&uuid]); } } } - //We write the results - self.node_id_vec = new_node_id_vec; + //We write the ring self.ring_assignation_data = Vec::::new(); return Ok(Some(old_assignation)); @@ -507,11 +521,11 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. - fn generate_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ + fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - - for uuid in self.node_id_vec.iter() { + + for uuid in self.useful_nodes().iter() { if self.roles.get(uuid) == None { return Err(Error::Message("The uuid was not found in the node roles (this should \ not happen, it might be a critical error).".into())); @@ -685,7 +699,7 @@ impl ClusterLayout { storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ See the detailed statistics below and look for saturated nodes/zones.")); - msg.push(format!("Recall that because of the replication, the actual available \ + msg.push(format!("Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); @@ -741,7 +755,7 @@ impl ClusterLayout { transferred.", total_new_partitions)); } msg.push(format!("")); - msg.push(format!("Detailed statistics by zones and nodes.")); + msg.push(format!("==== DETAILED STATISTICS BY ZONES AND NODES ====")); for z in 0..id_to_zone.len(){ let mut nodes_of_z = Vec::::new(); -- cgit v1.2.3 From 911eb17bd9e25f2f02fbe1de81a3384e99ea13ac Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 6 Oct 2022 14:53:57 +0200 Subject: corrected warnings of cargo clippy --- src/rpc/layout.rs | 111 ++++++++++++++++++++++++++---------------------------- 1 file changed, 53 insertions(+), 58 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 89c18c68..1969b721 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -56,7 +56,7 @@ pub struct ClusterLayout { } fn default_partition_size() -> u32{ - return 0; + 0 } fn default_layout_parameters() -> Lww{ @@ -107,15 +107,15 @@ impl NodeRole { pub fn tags_string(&self) -> String { let mut tags = String::new(); - if self.tags.len() == 0 { + if self.tags.is_empty() { return tags } tags.push_str(&self.tags[0].clone()); for t in 1..self.tags.len(){ - tags.push_str(","); + tags.push(','); tags.push_str(&self.tags[t].clone()); } - return tags; + tags } } @@ -246,22 +246,22 @@ To know the correct value of the new layout version, invoke `garage layout show` _ => () } } - return result; + result } ///Given a node uuids, this function returns the label of its zone pub fn get_node_zone(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { - Some(role) => return Ok(role.zone.clone()), - _ => return Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) + Some(role) => Ok(role.zone.clone()), + _ => Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) } } ///Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { - Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err(Error::Message("The Uuid does not correspond to a node present in the \ + Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => Ok(*cap), + _ => Err(Error::Message("The Uuid does not correspond to a node present in the \ cluster or this node does not have a positive capacity.".into())) } } @@ -272,7 +272,7 @@ To know the correct value of the new layout version, invoke `garage layout show` for uuid in self.useful_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; } - return Ok(total_capacity); + Ok(total_capacity) } @@ -341,10 +341,10 @@ To know the correct value of the new layout version, invoke `garage layout show` for n in self.ring_assignation_data.iter() { node_usage[*n as usize] += 1; } - for n in 0..MAX_NODE_NUMBER { - if node_usage[n] > 0 { + for (n, usage) in node_usage.iter().enumerate(){ + if *usage > 0 { let uuid = self.node_id_vec[n]; - if node_usage[n]*self.partition_size > self.get_node_capacity(&uuid) + if usage*self.partition_size > self.get_node_capacity(&uuid) .expect("Critical Error"){ return false; } @@ -435,7 +435,7 @@ impl ClusterLayout { let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { //We minimize the distance to the previous assignment. - self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); @@ -443,7 +443,7 @@ impl ClusterLayout { //We update the layout structure self.update_ring_from_flow(id_to_zone.len() , &gflow)?; - return Ok(msg); + Ok(msg) } /// The LwwMap of node roles might have changed. This function updates the node_id_vec @@ -456,21 +456,18 @@ impl ClusterLayout { //Non gateway nodes should be coded on 8bits, hence they must be first in the list //We build the new node ids let mut new_non_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| - match &v.0 {Some(r) if r.capacity != None => true, _=> false }) + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", MAX_NODE_NUMBER).into() )); + layout. This is not allowed.", MAX_NODE_NUMBER) )); } let mut new_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| - match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) .map(|(k, _, _)| *k).collect(); - let nb_useful_nodes = new_non_gateway_nodes.len(); let mut new_node_id_vec = Vec::::new(); new_node_id_vec.append(&mut new_non_gateway_nodes); new_node_id_vec.append(&mut new_gateway_nodes); @@ -484,7 +481,7 @@ impl ClusterLayout { let nb_partitions = 1usize << PARTITION_BITS; let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; - if self.ring_assignation_data.len() == 0 { + if self.ring_assignation_data.is_empty() { //This is a new association return Ok(None); } @@ -498,16 +495,16 @@ impl ClusterLayout { //We add the indices of only the new non-gateway nodes that can be used in the //association ring - for i in 0..nb_useful_nodes { - uuid_to_new_id.insert(new_node_id_vec[i], i ); + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i ); } let rf= self.replication_factor; - for p in 0..nb_partitions { + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { let uuid = old_node_id_vec[*old_id as usize]; if uuid_to_new_id.contains_key(&uuid) { - old_assignation[p].push(uuid_to_new_id[&uuid]); + old_assign_p.push(uuid_to_new_id[&uuid]); } } } @@ -515,7 +512,7 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); - return Ok(Some(old_assignation)); + Ok(Some(old_assignation)) } @@ -530,15 +527,14 @@ impl ClusterLayout { return Err(Error::Message("The uuid was not found in the node roles (this should \ not happen, it might be a critical error).".into())); } - match self.node_role(&uuid) { - Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone() , id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } - _ => () + if let Some(r) = self.node_role(uuid) { + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone() , id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } } } - return Ok((id_to_zone, zone_to_id)); + Ok((id_to_zone, zone_to_id)) } ///This function computes by dichotomy the largest realizable partition size, given @@ -566,7 +562,7 @@ impl ClusterLayout { } } - return Ok(s_down); + Ok(s_down) } fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { @@ -581,7 +577,7 @@ impl ClusterLayout { for n in 0..nb_nodes { vertices.push(Vertex::N(n)); } - return vertices; + vertices } fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { @@ -609,7 +605,7 @@ impl ClusterLayout { } } } - return Ok(g); + Ok(g) } @@ -620,11 +616,11 @@ impl ClusterLayout { let mut exclude_edge = HashSet::<(usize,usize)>::new(); if let Some(old_assoc) = old_assoc_opt { let nb_nodes = self.useful_nodes().len(); - for p in 0..NB_PARTITIONS { + for (p, old_assoc_p) in old_assoc.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p,n)); } - for n in old_assoc[p].iter() { + for n in old_assoc_p.iter() { exclude_edge.remove(&(p,*n)); } } @@ -638,13 +634,13 @@ impl ClusterLayout { g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; - return Ok(g); + Ok(g) } - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), Error > { + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &[Vec ]) -> Result<(), Error > { let mut cost = CostFunction::new(); - for p in 0..NB_PARTITIONS { - for n in old_assoc[p].iter() { + for (p, assoc_p) in old_assoc.iter().enumerate(){ + for n in assoc_p.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); } @@ -653,7 +649,7 @@ impl ClusterLayout { let path_length = 4*nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; - return Ok(()); + Ok(()) } fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ @@ -662,9 +658,8 @@ impl ClusterLayout { for z in 0..nb_zones { let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; for vertex in assoc_vertex.iter() { - match vertex{ - Vertex::N(n) => self.ring_assignation_data.push((*n).try_into().unwrap()), - _ => () + if let Vertex::N(n) = vertex { + self.ring_assignation_data.push((*n).try_into().unwrap()); } } } @@ -674,7 +669,7 @@ impl ClusterLayout { return Err(Error::Message("Critical Error : the association ring we produced does not \ have the right size.".into())); } - return Ok(()); + Ok(()) } @@ -683,7 +678,7 @@ impl ClusterLayout { fn output_stat(&self , gflow : &Graph, old_assoc_opt : &Option< Vec> >, zone_to_id: &HashMap, - id_to_zone : &Vec) -> Result{ + id_to_zone : &[String]) -> Result{ let mut msg = Message::new(); let nb_partitions = 1usize << PARTITION_BITS; @@ -693,12 +688,12 @@ impl ClusterLayout { let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap , total_cap , percent_cap )); - msg.push(format!("")); - msg.push(format!("If the percentage is to low, it might be that the \ + msg.push("".into()); + msg.push("If the percentage is to low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ - See the detailed statistics below and look for saturated nodes/zones.")); + See the detailed statistics below and look for saturated nodes/zones.".into()); msg.push(format!("Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", used_cap , self.replication_factor , @@ -715,7 +710,7 @@ impl ClusterLayout { for p in 0..nb_partitions { for z in 0..id_to_zone.len() { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - if pz_nodes.len() > 0 { + if !pz_nodes.is_empty() { stored_partitions_zone[z] += 1; if let Some(old_assoc) = old_assoc_opt { let mut old_zones_of_p = Vec::::new(); @@ -748,14 +743,14 @@ impl ClusterLayout { //We display the statistics - msg.push(format!("")); + msg.push("".into()); if *old_assoc_opt != None { let total_new_partitions : usize = new_partitions.iter().sum(); msg.push(format!("A total of {} new copies of partitions need to be \ transferred.", total_new_partitions)); } - msg.push(format!("")); - msg.push(format!("==== DETAILED STATISTICS BY ZONES AND NODES ====")); + msg.push("".into()); + msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); for z in 0..id_to_zone.len(){ let mut nodes_of_z = Vec::::new(); @@ -766,7 +761,7 @@ impl ClusterLayout { } let replicated_partitions : usize = nodes_of_z.iter() .map(|n| stored_partitions[*n]).sum(); - msg.push(format!("")); + msg.push("".into()); msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], @@ -796,7 +791,7 @@ impl ClusterLayout { } } - return Ok(msg); + Ok(msg) } } -- cgit v1.2.3 From fcf9ac674a2842b2b55d933e60af5af93dcc4592 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:19:25 +0200 Subject: Tests written in layout.rs added staged_parameters to ClusterLayout removed the serde(default) -> will need a migration function --- src/rpc/layout.rs | 232 ++++++++++++++++++++++++------------------------------ 1 file changed, 104 insertions(+), 128 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 1969b721..976f94af 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -30,8 +30,8 @@ pub struct ClusterLayout { //This attribute is only used to retain the previously computed partition size, //to know to what extent does it change with the layout update. - #[serde(default="default_partition_size")] pub partition_size: u32, + pub parameters: LayoutParameters, pub roles: LwwMap, @@ -49,20 +49,11 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout - #[serde(default="default_layout_parameters")] - pub parameters: Lww, + pub staged_parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } -fn default_partition_size() -> u32{ - 0 -} - -fn default_layout_parameters() -> Lww{ - Lww::::new(LayoutParameters{ zone_redundancy: 1}) -} - ///This struct is used to set the parameters to be used in the assignation computation ///algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] @@ -124,8 +115,8 @@ impl ClusterLayout { //We set the default zone redundancy to be equal to the replication factor, //i.e. as strict as possible. - let default_parameters = Lww::::new( - LayoutParameters{ zone_redundancy: replication_factor}); + let parameters = LayoutParameters{ zone_redundancy: replication_factor}; + let staged_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); @@ -137,7 +128,8 @@ impl ClusterLayout { roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), - parameters: default_parameters, + parameters, + staged_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -150,8 +142,8 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.parameters.get() != other.parameters.get(); - self.parameters.merge(&other.parameters); + let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); + self.staged_parameters.merge(&other.staged_parameters); self.staging.merge(&other.staging); @@ -330,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let zones_of_p = nodes_of_p.iter() .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.")); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.parameters.zone_redundancy; if zones_of_p.unique().count() < redundancy { return false; } @@ -384,7 +376,8 @@ impl ClusterLayout { //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; + let mut msg = Message::new(); msg.push(format!("Computation of a new cluster layout where partitions are \ @@ -417,13 +410,15 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!("Given the replication and redundancy constraint, the \ optimal size of a partition is {}. In the previous layout, it used to \ - be {}.", partition_size, self.partition_size)); + be {} (the zone redundancy was {}).", partition_size, self.partition_size, + self.parameters.zone_redundancy)); } else { msg.push(format!("Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", partition_size)); } self.partition_size = partition_size; + self.parameters = self.staged_parameters.get().clone(); if partition_size < 100 { msg.push("WARNING: The partition size is low (< 100), you might consider to \ @@ -511,6 +506,10 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); + + if !self.check() { + return Err(Error::Message("Critical error: The computed layout happens to be incorrect".into())); + } Ok(Some(old_assignation)) } @@ -585,7 +584,7 @@ impl ClusterLayout { self.useful_nodes().len()); let mut g= Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; for p in 0..NB_PARTITIONS { g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; @@ -800,96 +799,80 @@ impl ClusterLayout { #[cfg(test)] mod tests { - use super::*; - use std::io::*; -// use itertools::Itertools; -/* - fn check_assignation(cl: &ClusterLayout) { - //Check that input data has the right format - let nb_partitions = 1usize << PARTITION_BITS; - assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); - - //Check that is is a correct assignation with zone redundancy - let rf = cl.replication_factor; - for i in 0..nb_partitions { - assert!( - rf == cl.ring_assignation_data[rf * i..rf * (i + 1)] - .iter() - .map(|nod| node_zone[*nod as usize].clone()) - .unique() - .count() - ); - } + use super::{*,Error}; + use std::cmp::min; + + + //This function checks that the partition size S computed is at least better than the + //one given by a very naive algorithm. To do so, we try to run the naive algorithm + //assuming a partion size of S+1. If we succed, it means that the optimal assignation + //was not optimal. The naive algorithm is the following : + //- we compute the max number of partitions associated to every node, capped at the + //partition number. It gives the number of tokens of every node. + //- every zone has a number of tokens equal to the sum of the tokens of its nodes. + //- we cycle over the partitions and associate zone tokens while respecting the + //zone redundancy constraint. + //NOTE: the naive algorithm is not optimal. Counter example: + //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + //With these parameters, the naive algo fails, whereas there is a solution: + //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &ClusterLayout) -> Result { + let over_size = cl.partition_size +1; + let mut zone_token = HashMap::::new(); + let nb_partitions = 1usize << PARTITION_BITS; + + let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } - let nb_nodes = cl.node_id_vec.len(); - //Check optimality - let node_nb_part = (0..nb_nodes) - .map(|i| { - cl.ring_assignation_data - .iter() - .filter(|x| **x == i as u8) - .count() - }) - .collect::>(); + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.useful_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert(z.clone(), zone_token[&z] + min(nb_partitions , (c/over_size) as usize)); + } + + //For every partition, we count the number of zone already associated and + //the name of the last zone associated - let zone_vec = node_zone.iter().unique().collect::>(); - let zone_nb_part = zone_vec - .iter() - .map(|z| { - cl.ring_assignation_data - .iter() - .filter(|x| node_zone[**x as usize] == **z) - .count() - }) - .collect::>(); + let mut id_zone_token = vec![0; zones.len()]; + for (z,t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } - //Check optimality of the zone assignation : would it be better for the - //node_capacity/node_partitions ratio to change the assignation of a partition - - if let Some(idmin) = (0..nb_nodes).min_by(|i, j| { - (node_capacity[*i] * node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) - }) { - if let Some(idnew) = (0..nb_nodes) - .filter(|i| { - if let Some(p) = zone_vec.iter().position(|z| **z == node_zone[*i]) { - zone_nb_part[p] < nb_partitions - } else { - false - } - }) - .max_by(|i, j| { - (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) - .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) - }) { - assert!( - node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) - >= node_capacity[idnew] * node_nb_part[idmin] as u32 - ); - } - } + let mut nb_token = vec![0; nb_partitions]; + let mut last_zone = vec![zones.len(); nb_partitions]; + + let mut curr_zone = 0; + + let redundancy = cl.parameters.zone_redundancy; + + for replic in 0..cl.replication_factor { + for p in 0..nb_partitions { + while id_zone_token[curr_zone] == 0 || + (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } - //In every zone, check optimality of the nod assignation - for z in zone_vec { - let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z); - if let Some(idmin) = node_of_z_iter.clone().min_by(|i, j| { - (node_capacity[*i] * node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) - }) { - if let Some(idnew) = node_of_z_iter.min_by(|i, j| { - (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) - .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) - }) { - assert!( - node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) - >= node_capacity[idnew] * node_nb_part[idmin] as u32 - ); - } - } - } - } -*/ - fn show_msg(msg : &Message) { for s in msg.iter(){ println!("{}",s); @@ -901,6 +884,7 @@ mod tests { node_id_vec: &Vec, node_capacity_vec: &Vec, node_zone_vec: &Vec, + zone_redundancy: usize ) { for i in 0..node_id_vec.len() { if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { @@ -917,11 +901,11 @@ mod tests { ); cl.roles.merge(&update); } + cl.staged_parameters = Lww::::new(LayoutParameters{zone_redundancy}); } #[test] fn test_assignation() { - std::io::stdout().flush().ok().expect("Could not flush stdout"); let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] @@ -929,22 +913,11 @@ mod tests { .map(|x| x.to_string()) .collect(); - let mut cl = ClusterLayout { - node_id_vec: vec![], - - roles: LwwMap::new(), - - replication_factor: 3, - zone_redundancy: 1, - partition_size: 0, - ring_assignation_data: vec![], - version: 0, - staging: LwwMap::new(), - staging_hash: blake2sum(&rmp_to_vec_all_named(&LwwMap::::new()).unwrap()[..]), - }; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + let mut cl = ClusterLayout::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; @@ -952,19 +925,22 @@ mod tests { .into_iter() .map(|x| x.to_string()) .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,1).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); } } -- cgit v1.2.3 From 4abab246f1113a9a1988fdfca81c1dd8ffa323c8 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:21:13 +0200 Subject: cargo fmt --- src/rpc/layout.rs | 1332 +++++++++++++++++++++++++++++------------------------ 1 file changed, 725 insertions(+), 607 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 976f94af..3a6f42ee 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; -use garage_util::crdt::{AutoCrdt, Crdt, LwwMap, Lww}; +use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::error::*; @@ -27,11 +27,11 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, - - //This attribute is only used to retain the previously computed partition size, - //to know to what extent does it change with the layout update. - pub partition_size: u32, - pub parameters: LayoutParameters, + + //This attribute is only used to retain the previously computed partition size, + //to know to what extent does it change with the layout update. + pub partition_size: u32, + pub parameters: LayoutParameters, pub roles: LwwMap, @@ -39,7 +39,7 @@ pub struct ClusterLayout { /// in the system (this includes gateway nodes). /// The order here is different than the vec stored by `roles`, because: /// 1. non-gateway nodes are first so that they have lower numbers holding - /// in u8 (the number of non-gateway nodes is at most 256). + /// in u8 (the number of non-gateway nodes is at most 256). /// 2. nodes that don't have a role are excluded (but they need to /// stay in the CRDT as tombstones) pub node_id_vec: Vec, @@ -49,7 +49,7 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout - pub staged_parameters: Lww, + pub staged_parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } @@ -58,14 +58,14 @@ pub struct ClusterLayout { ///algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct LayoutParameters { - pub zone_redundancy:usize, + pub zone_redundancy: usize, } impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -96,27 +96,28 @@ impl NodeRole { } } - pub fn tags_string(&self) -> String { - let mut tags = String::new(); - if self.tags.is_empty() { - return tags - } - tags.push_str(&self.tags[0].clone()); - for t in 1..self.tags.len(){ - tags.push(','); - tags.push_str(&self.tags[t].clone()); - } - tags - } + pub fn tags_string(&self) -> String { + let mut tags = String::new(); + if self.tags.is_empty() { + return tags; + } + tags.push_str(&self.tags[0].clone()); + for t in 1..self.tags.len() { + tags.push(','); + tags.push_str(&self.tags[t].clone()); + } + tags + } } impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - - //We set the default zone redundancy to be equal to the replication factor, - //i.e. as strict as possible. - let parameters = LayoutParameters{ zone_redundancy: replication_factor}; - let staged_parameters = Lww::::new(parameters.clone()); + //We set the default zone redundancy to be equal to the replication factor, + //i.e. as strict as possible. + let parameters = LayoutParameters { + zone_redundancy: replication_factor, + }; + let staged_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); @@ -124,12 +125,12 @@ impl ClusterLayout { ClusterLayout { version: 0, replication_factor, - partition_size: 0, + partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), - parameters, - staged_parameters, + parameters, + staged_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -142,11 +143,10 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); - self.staged_parameters.merge(&other.staged_parameters); + let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); + self.staged_parameters.merge(&other.staged_parameters); self.staging.merge(&other.staging); - let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); let stage_changed = new_staging_hash != self.staging_hash; @@ -158,7 +158,7 @@ impl ClusterLayout { } } - pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self,Message), Error> { + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { match version { None => { let error = r#" @@ -177,14 +177,14 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging); self.roles.retain(|(_, _, v)| v.0.is_some()); - let msg = self.calculate_partition_assignation()?; + let msg = self.calculate_partition_assignation()?; self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); self.version += 1; - Ok((self,msg)) + Ok((self, msg)) } pub fn revert_staged_changes(mut self, version: Option) -> Result { @@ -229,44 +229,52 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub fn useful_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity != None => result.push(*uuid), - _ => () - } - } - result - } - - ///Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid : &Uuid) -> Result { - match self.node_role(uuid) { - Some(role) => Ok(role.zone.clone()), - _ => Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) - } - } - - ///Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { - match self.node_role(uuid) { - Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => Ok(*cap), - _ => Err(Error::Message("The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity.".into())) - } - } - - ///Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { - let mut total_capacity = 0; - for uuid in self.useful_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; - } - Ok(total_capacity) - } + ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub fn useful_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity != None => result.push(*uuid), + _ => (), + } + } + result + } + + ///Given a node uuids, this function returns the label of its zone + pub fn get_node_zone(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(role) => Ok(role.zone.clone()), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + + ///Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole { + capacity: Some(cap), + zone: _, + tags: _, + }) => Ok(*cap), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )), + } + } + ///Returns the sum of capacities of non gateway nodes in the cluster + pub fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.useful_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + Ok(total_capacity) + } /// Check a cluster layout for internal consistency /// returns true if consistent, false if error @@ -311,580 +319,689 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that every partition is associated to distinct nodes - let rf = self.replication_factor; - for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignation_data[rf*p..rf*(p+1)].to_vec(); - if nodes_of_p.iter().unique().count() != rf { - return false; - } - //Check that every partition is spread over at least zone_redundancy zones. - let zones_of_p = nodes_of_p.iter() - .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.")); - let redundancy = self.parameters.zone_redundancy; - if zones_of_p.unique().count() < redundancy { - return false; - } - } - - //Check that the nodes capacities is consistent with the stored partitions - let mut node_usage = vec![0; MAX_NODE_NUMBER]; - for n in self.ring_assignation_data.iter() { - node_usage[*n as usize] += 1; - } - for (n, usage) in node_usage.iter().enumerate(){ - if *usage > 0 { - let uuid = self.node_id_vec[n]; - if usage*self.partition_size > self.get_node_capacity(&uuid) - .expect("Critical Error"){ - return false; - } - } - } - - //Check that the partition size stored is the one computed by the asignation - //algorithm. - let cl2 = self.clone(); - let (_ , zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); - let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); - if partition_size != self.partition_size { - return false; - } + //Check that every partition is associated to distinct nodes + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return false; + } + //Check that every partition is spread over at least zone_redundancy zones. + let zones_of_p = nodes_of_p.iter().map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }); + let redundancy = self.parameters.zone_redundancy; + if zones_of_p.unique().count() < redundancy { + return false; + } + } + + //Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignation_data.iter() { + node_usage[*n as usize] += 1; + } + for (n, usage) in node_usage.iter().enumerate() { + if *usage > 0 { + let uuid = self.node_id_vec[n]; + if usage * self.partition_size + > self.get_node_capacity(&uuid).expect("Critical Error") + { + return false; + } + } + } + //Check that the partition size stored is the one computed by the asignation + //algorithm. + let cl2 = self.clone(); + let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); + let partition_size = cl2 + .compute_optimal_partition_size(&zone_to_id) + .expect("Critical Error"); + if partition_size != self.partition_size { + return false; + } true } - } impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor - /// and the zone redundancy parameter It maximizes the capacity of a + /// and the zone redundancy parameter It maximizes the capacity of a /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - /// Staged changes must be merged with nodes roles before calling this function. - pub fn calculate_partition_assignation(&mut self) -> Result { + /// Staged changes must be merged with nodes roles before calling this function. + pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. - - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with the new ids - let old_assignation_opt = self.update_node_id_vec()?; - - let redundancy = self.staged_parameters.get().zone_redundancy; - - - let mut msg = Message::new(); - msg.push(format!("Computation of a new cluster layout where partitions are \ - replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); - - //We generate for once numerical ids for the zones of non gateway nodes, - //to use them as indices in the flow graphs. - let (id_to_zone , zone_to_id) = self.generate_useful_zone_ids()?; - - let nb_useful_nodes = self.useful_nodes().len(); - msg.push(format!("The cluster contains {} nodes spread over {} zones.", - nb_useful_nodes, id_to_zone.len())); - if nb_useful_nodes < self.replication_factor{ - return Err(Error::Message(format!("The number of nodes with positive \ + + //We update the node ids, since the node role list might have changed with the + //changes in the layout. We retrieve the old_assignation reframed with the new ids + let old_assignation_opt = self.update_node_id_vec()?; + + let redundancy = self.staged_parameters.get().zone_redundancy; + + let mut msg = Message::new(); + msg.push(format!( + "Computation of a new cluster layout where partitions are \ + replicated {} times on at least {} distinct zones.", + self.replication_factor, redundancy + )); + + //We generate for once numerical ids for the zones of non gateway nodes, + //to use them as indices in the flow graphs. + let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; + + let nb_useful_nodes = self.useful_nodes().len(); + msg.push(format!( + "The cluster contains {} nodes spread over {} zones.", + nb_useful_nodes, + id_to_zone.len() + )); + if nb_useful_nodes < self.replication_factor { + return Err(Error::Message(format!( + "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_useful_nodes, self.replication_factor))); - } - if id_to_zone.len() < redundancy { - return Err(Error::Message(format!("The number of zones with non-gateway \ + nb_useful_nodes, self.replication_factor + ))); + } + if id_to_zone.len() < redundancy { + return Err(Error::Message(format!( + "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", - id_to_zone.len() , redundancy))); - } - - //We compute the optimal partition size - //Capacities should be given in a unit so that partition size is at least 100. - //In this case, integer rounding plays a marginal role in the percentages of - //optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; - - if old_assignation_opt != None { - msg.push(format!("Given the replication and redundancy constraint, the \ + id_to_zone.len(), + redundancy + ))); + } + + //We compute the optimal partition size + //Capacities should be given in a unit so that partition size is at least 100. + //In this case, integer rounding plays a marginal role in the percentages of + //optimality. + let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + + if old_assignation_opt != None { + msg.push(format!( + "Given the replication and redundancy constraint, the \ optimal size of a partition is {}. In the previous layout, it used to \ - be {} (the zone redundancy was {}).", partition_size, self.partition_size, - self.parameters.zone_redundancy)); - } - else { - msg.push(format!("Given the replication and redundancy constraints, the \ - optimal size of a partition is {}.", partition_size)); - } - self.partition_size = partition_size; - self.parameters = self.staged_parameters.get().clone(); - - if partition_size < 100 { - msg.push("WARNING: The partition size is low (< 100), you might consider to \ - provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb).".into()); - } - - //We compute a first flow/assignment that is heuristically close to the previous - //assignment - let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; - if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignment. - self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; - } - - msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); - msg.push("".to_string()); - - //We update the layout structure - self.update_ring_from_flow(id_to_zone.len() , &gflow)?; - Ok(msg) - } + be {} (the zone redundancy was {}).", + partition_size, self.partition_size, self.parameters.zone_redundancy + )); + } else { + msg.push(format!( + "Given the replication and redundancy constraints, the \ + optimal size of a partition is {}.", + partition_size + )); + } + self.partition_size = partition_size; + self.parameters = self.staged_parameters.get().clone(); + + if partition_size < 100 { + msg.push( + "WARNING: The partition size is low (< 100), you might consider to \ + provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb)." + .into(), + ); + } + + //We compute a first flow/assignment that is heuristically close to the previous + //assignment + let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignation_opt)?; + if let Some(assoc) = &old_assignation_opt { + //We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; + } + + msg.append(&mut self.output_stat( + &gflow, + &old_assignation_opt, + &zone_to_id, + &id_to_zone, + )?); + msg.push("".to_string()); + + //We update the layout structure + self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + Ok(msg) + } /// The LwwMap of node roles might have changed. This function updates the node_id_vec /// and returns the assignation given by ring, with the new indices of the nodes, and /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,Error> { - // (1) We compute the new node list - //Non gateway nodes should be coded on 8bits, hence they must be first in the list - //We build the new node ids - let mut new_non_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) - .map(|(k, _, _)| *k).collect(); - - if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", MAX_NODE_NUMBER) )); - } - - let mut new_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) - .map(|(k, _, _)| *k).collect(); - - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.append(&mut new_non_gateway_nodes); - new_node_id_vec.append(&mut new_gateway_nodes); - - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); - - // (2) We retrieve the old association - //We rewrite the old association with the new indices. We only consider partition - //to node assignations where the node is still in use. - let nb_partitions = 1usize << PARTITION_BITS; - let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; - - if self.ring_assignation_data.is_empty() { - //This is a new association - return Ok(None); - } - if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err(Error::Message("The old assignation does not have a size corresponding to \ - the old replication factor or the number of partitions.".into())); - } - - //We build a translation table between the uuid and new ids - let mut uuid_to_new_id = HashMap::::new(); - - //We add the indices of only the new non-gateway nodes that can be used in the - //association ring - for (i, uuid) in new_node_id_vec.iter().enumerate() { - uuid_to_new_id.insert(*uuid, i ); - } - - let rf= self.replication_factor; - for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { - for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { - let uuid = old_node_id_vec[*old_id as usize]; - if uuid_to_new_id.contains_key(&uuid) { - old_assign_p.push(uuid_to_new_id[&uuid]); - } - } - } - - //We write the ring - self.ring_assignation_data = Vec::::new(); - - if !self.check() { - return Err(Error::Message("Critical error: The computed layout happens to be incorrect".into())); - } - - Ok(Some(old_assignation)) + fn update_node_id_vec(&mut self) -> Result>>, Error> { + // (1) We compute the new node list + //Non gateway nodes should be coded on 8bits, hence they must be first in the list + //We build the new node ids + let mut new_non_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) + .map(|(k, _, _)| *k) + .collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(Error::Message(format!( + "There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", + MAX_NODE_NUMBER + ))); + } + + let mut new_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) + .map(|(k, _, _)| *k) + .collect(); + + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.append(&mut new_non_gateway_nodes); + new_node_id_vec.append(&mut new_gateway_nodes); + + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); + + // (2) We retrieve the old association + //We rewrite the old association with the new indices. We only consider partition + //to node assignations where the node is still in use. + let nb_partitions = 1usize << PARTITION_BITS; + let mut old_assignation = vec![Vec::::new(); nb_partitions]; + + if self.ring_assignation_data.is_empty() { + //This is a new association + return Ok(None); + } + if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + return Err(Error::Message( + "The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions." + .into(), + )); + } + + //We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + //We add the indices of only the new non-gateway nodes that can be used in the + //association ring + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i); + } + + let rf = self.replication_factor; + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { + for old_id in &self.ring_assignation_data[p * rf..(p + 1) * rf] { + let uuid = old_node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assign_p.push(uuid_to_new_id[&uuid]); + } + } + } + + //We write the ring + self.ring_assignation_data = Vec::::new(); + + if !self.check() { + return Err(Error::Message( + "Critical error: The computed layout happens to be incorrect".into(), + )); + } + + Ok(Some(old_assignation)) } + ///This function generates ids for the zone of the nodes appearing in + ///self.node_id_vec. + fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.useful_nodes().iter() { + if self.roles.get(uuid) == None { + return Err(Error::Message( + "The uuid was not found in the node roles (this should \ + not happen, it might be a critical error)." + .into(), + )); + } + if let Some(r) = self.node_role(uuid) { + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + } + } + Ok((id_to_zone, zone_to_id)) + } - ///This function generates ids for the zone of the nodes appearing in - ///self.node_id_vec. - fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ - let mut id_to_zone = Vec::::new(); - let mut zone_to_id = HashMap::::new(); - - for uuid in self.useful_nodes().iter() { - if self.roles.get(uuid) == None { - return Err(Error::Message("The uuid was not found in the node roles (this should \ - not happen, it might be a critical error).".into())); - } - if let Some(r) = self.node_role(uuid) { - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone() , id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } - } - } - Ok((id_to_zone, zone_to_id)) - } - - ///This function computes by dichotomy the largest realizable partition size, given - ///the layout. - fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ - let nb_partitions = 1usize << PARTITION_BITS; - let empty_set = HashSet::<(usize,usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err(Error::Message("The storage capacity of he cluster is to small. It is \ - impossible to store partitions of size 1.".into())); - } - - let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; - while s_down +1 < s_up { - g = self.generate_flow_graph((s_down+s_up)/2, zone_to_id, &empty_set)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - s_up = (s_down+s_up)/2; - } - else { - s_down = (s_down+s_up)/2; - } - } - - Ok(s_down) - } - - fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { - let mut vertices = vec![Vertex::Source, Vertex::Sink]; - for p in 0..NB_PARTITIONS { - vertices.push(Vertex::Pup(p)); - vertices.push(Vertex::Pdown(p)); - for z in 0..nb_zones { - vertices.push(Vertex::PZ(p, z)); - } - } - for n in 0..nb_nodes { - vertices.push(Vertex::N(n)); - } - vertices - } - - fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { - let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), - self.useful_nodes().len()); - let mut g= Graph::::new(&vertices); - let nb_zones = zone_to_id.len(); - let redundancy = self.staged_parameters.get().zone_redundancy; - for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; - g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; - for z in 0..nb_zones { - g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; - g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , - self.replication_factor as u32)?; - } - } - for n in 0..self.useful_nodes().len() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity/size)?; - for p in 0..NB_PARTITIONS { - if !exclude_assoc.contains(&(p,n)) { - g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; - } - } - } - Ok(g) - } - - - fn compute_candidate_assignment(&self, zone_to_id: &HashMap, - old_assoc_opt : &Option >>) -> Result, Error > { - - //We list the edges that are not used in the old association - let mut exclude_edge = HashSet::<(usize,usize)>::new(); - if let Some(old_assoc) = old_assoc_opt { - let nb_nodes = self.useful_nodes().len(); - for (p, old_assoc_p) in old_assoc.iter().enumerate() { - for n in 0..nb_nodes { - exclude_edge.insert((p,n)); - } - for n in old_assoc_p.iter() { - exclude_edge.remove(&(p,*n)); - } - } - } - - //We compute the best flow using only the edges used in the old assoc - let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge )?; - g.compute_maximal_flow()?; - for (p,n) in exclude_edge.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; - g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; - } - g.compute_maximal_flow()?; - Ok(g) - } - - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &[Vec ]) -> Result<(), Error > { - let mut cost = CostFunction::new(); - for (p, assoc_p) in old_assoc.iter().enumerate(){ - for n in assoc_p.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; - cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); - } - } - let nb_nodes = self.useful_nodes().len(); - let path_length = 4*nb_nodes; - gflow.optimize_flow_with_cost(&cost, path_length)?; - - Ok(()) - } - - fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ - self.ring_assignation_data = Vec::::new(); - for p in 0..NB_PARTITIONS { - for z in 0..nb_zones { - let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - for vertex in assoc_vertex.iter() { - if let Vertex::N(n) = vertex { - self.ring_assignation_data.push((*n).try_into().unwrap()); - } - } - } - } - - if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err(Error::Message("Critical Error : the association ring we produced does not \ - have the right size.".into())); - } - Ok(()) - } - - - //This function returns a message summing up the partition repartition of the new - //layout. - fn output_stat(&self , gflow : &Graph, - old_assoc_opt : &Option< Vec> >, - zone_to_id: &HashMap, - id_to_zone : &[String]) -> Result{ - let mut msg = Message::new(); - + ///This function computes by dichotomy the largest realizable partition size, given + ///the layout. + fn compute_optimal_partition_size( + &self, + zone_to_id: &HashMap, + ) -> Result { let nb_partitions = 1usize << PARTITION_BITS; - let used_cap = self.partition_size * nb_partitions as u32 * - self.replication_factor as u32; - let total_cap = self.get_total_capacity()?; - let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); - msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", - used_cap , total_cap , percent_cap )); - msg.push("".into()); - msg.push("If the percentage is to low, it might be that the \ + let empty_set = HashSet::<(usize, usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? + < (nb_partitions * self.replication_factor) + .try_into() + .unwrap() + { + return Err(Error::Message( + "The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1." + .into(), + )); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down + 1 < s_up { + g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? + < (nb_partitions * self.replication_factor) + .try_into() + .unwrap() + { + s_up = (s_down + s_up) / 2; + } else { + s_down = (s_down + s_up) / 2; + } + } + + Ok(s_down) + } + + fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + vertices + } + + fn generate_flow_graph( + &self, + size: u32, + zone_to_id: &HashMap, + exclude_assoc: &HashSet<(usize, usize)>, + ) -> Result, Error> { + let vertices = + ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); + let mut g = Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + let redundancy = self.staged_parameters.get().zone_redundancy; + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge( + Vertex::Source, + Vertex::Pdown(p), + (self.replication_factor - redundancy) as u32, + )?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; + g.add_edge( + Vertex::Pdown(p), + Vertex::PZ(p, z), + self.replication_factor as u32, + )?; + } + } + for n in 0..self.useful_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p, n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + Ok(g) + } + + fn compute_candidate_assignment( + &self, + zone_to_id: &HashMap, + old_assoc_opt: &Option>>, + ) -> Result, Error> { + //We list the edges that are not used in the old association + let mut exclude_edge = HashSet::<(usize, usize)>::new(); + if let Some(old_assoc) = old_assoc_opt { + let nb_nodes = self.useful_nodes().len(); + for (p, old_assoc_p) in old_assoc.iter().enumerate() { + for n in 0..nb_nodes { + exclude_edge.insert((p, n)); + } + for n in old_assoc_p.iter() { + exclude_edge.remove(&(p, *n)); + } + } + } + + //We compute the best flow using only the edges used in the old assoc + let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + g.compute_maximal_flow()?; + for (p, n) in exclude_edge.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + Ok(g) + } + + fn minimize_rebalance_load( + &self, + gflow: &mut Graph, + zone_to_id: &HashMap, + old_assoc: &[Vec], + ) -> Result<(), Error> { + let mut cost = CostFunction::new(); + for (p, assoc_p) in old_assoc.iter().enumerate() { + for n in assoc_p.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); + } + } + let nb_nodes = self.useful_nodes().len(); + let path_length = 4 * nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + Ok(()) + } + + fn update_ring_from_flow( + &mut self, + nb_zones: usize, + gflow: &Graph, + ) -> Result<(), Error> { + self.ring_assignation_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + for vertex in assoc_vertex.iter() { + if let Vertex::N(n) = vertex { + self.ring_assignation_data.push((*n).try_into().unwrap()); + } + } + } + } + + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "Critical Error : the association ring we produced does not \ + have the right size." + .into(), + )); + } + Ok(()) + } + + //This function returns a message summing up the partition repartition of the new + //layout. + fn output_stat( + &self, + gflow: &Graph, + old_assoc_opt: &Option>>, + zone_to_id: &HashMap, + id_to_zone: &[String], + ) -> Result { + let mut msg = Message::new(); + + let nb_partitions = 1usize << PARTITION_BITS; + let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push(format!( + "Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + used_cap, total_cap, percent_cap + )); + msg.push("".into()); + msg.push( + "If the percentage is to low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ - See the detailed statistics below and look for saturated nodes/zones.".into()); - msg.push(format!("Recall that because of the replication factor, the actual available \ - storage capacity is {} / {} = {}.", - used_cap , self.replication_factor , - used_cap/self.replication_factor as u32)); - - //We define and fill in the following tables - let storing_nodes = self.useful_nodes(); - let mut new_partitions = vec![0; storing_nodes.len()]; - let mut stored_partitions = vec![0; storing_nodes.len()]; - - let mut new_partitions_zone = vec![0; id_to_zone.len()]; - let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - - for p in 0..nb_partitions { - for z in 0..id_to_zone.len() { - let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - if !pz_nodes.is_empty() { - stored_partitions_zone[z] += 1; - if let Some(old_assoc) = old_assoc_opt { - let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { - old_zones_of_p.push( - zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } - } - for vert in pz_nodes.iter() { - if let Vertex::N(n) = *vert { - stored_partitions[n] += 1; - if let Some(old_assoc) = old_assoc_opt { - if !old_assoc[p].contains(&n) { - new_partitions[n] += 1; - } - } - } - } - } - } - - if *old_assoc_opt == None { - new_partitions = stored_partitions.clone(); - new_partitions_zone = stored_partitions_zone.clone(); - } - - //We display the statistics - - msg.push("".into()); - if *old_assoc_opt != None { - let total_new_partitions : usize = new_partitions.iter().sum(); - msg.push(format!("A total of {} new copies of partitions need to be \ - transferred.", total_new_partitions)); - } - msg.push("".into()); - msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); - - for z in 0..id_to_zone.len(){ - let mut nodes_of_z = Vec::::new(); - for n in 0..storing_nodes.len(){ - if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { - nodes_of_z.push(n); - } - } - let replicated_partitions : usize = nodes_of_z.iter() - .map(|n| stored_partitions[*n]).sum(); - msg.push("".into()); - - msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ - {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], - new_partitions_zone[z], replicated_partitions)); - - let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; - let mut total_cap_z = 0; - for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; - } - let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); - msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", - available_cap_z, total_cap_z, percent_cap_z)); - - for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; - let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self.node_role(&self.node_id_vec[*n]) - .ok_or("Node not found."))?.tags_string(); - msg.push(format!(" Node {}: {} partitions ({} new) ; \ - available/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec()[0..2].to_vec().encode_hex::(), - stored_partitions[*n], - new_partitions[*n], available_cap_n, total_cap_n, - (available_cap_n as f32)/(total_cap_n as f32)*100.0 , - tags_n)); - } - } - - Ok(msg) - } - + See the detailed statistics below and look for saturated nodes/zones." + .into(), + ); + msg.push(format!( + "Recall that because of the replication factor, the actual available \ + storage capacity is {} / {} = {}.", + used_cap, + self.replication_factor, + used_cap / self.replication_factor as u32 + )); + + //We define and fill in the following tables + let storing_nodes = self.useful_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..nb_partitions { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + if !pz_nodes.is_empty() { + stored_partitions_zone[z] += 1; + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p + .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(old_assoc) = old_assoc_opt { + if !old_assoc[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + } + } + + if *old_assoc_opt == None { + new_partitions = stored_partitions.clone(); + new_partitions_zone = stored_partitions_zone.clone(); + } + + //We display the statistics + + msg.push("".into()); + if *old_assoc_opt != None { + let total_new_partitions: usize = new_partitions.iter().sum(); + msg.push(format!( + "A total of {} new copies of partitions need to be \ + transferred.", + total_new_partitions + )); + } + msg.push("".into()); + msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); + + for z in 0..id_to_zone.len() { + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len() { + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions: usize = + nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); + msg.push("".into()); + + msg.push(format!( + "Zone {}: {} distinct partitions stored ({} new, \ + {} partition copies) ", + id_to_zone[z], + stored_partitions_zone[z], + new_partitions_zone[z], + replicated_partitions + )); + + let available_cap_z: u32 = self.partition_size * replicated_partitions as u32; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); + msg.push(format!( + " Available capacity / Total capacity: {}/{} ({:.1}%).", + available_cap_z, total_cap_z, percent_cap_z + )); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u32 * self.partition_size; + let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self + .node_role(&self.node_id_vec[*n]) + .ok_or("Node not found."))? + .tags_string(); + msg.push(format!( + " Node {}: {} partitions ({} new) ; \ + available/total capacity: {} / {} ({:.1}%) ; tags:{}", + &self.node_id_vec[*n].to_vec()[0..2] + .to_vec() + .encode_hex::(), + stored_partitions[*n], + new_partitions[*n], + available_cap_n, + total_cap_n, + (available_cap_n as f32) / (total_cap_n as f32) * 100.0, + tags_n + )); + } + } + + Ok(msg) + } } //==================================================================================== #[cfg(test)] mod tests { - use super::{*,Error}; - use std::cmp::min; - - - //This function checks that the partition size S computed is at least better than the - //one given by a very naive algorithm. To do so, we try to run the naive algorithm - //assuming a partion size of S+1. If we succed, it means that the optimal assignation - //was not optimal. The naive algorithm is the following : - //- we compute the max number of partitions associated to every node, capped at the - //partition number. It gives the number of tokens of every node. - //- every zone has a number of tokens equal to the sum of the tokens of its nodes. - //- we cycle over the partitions and associate zone tokens while respecting the - //zone redundancy constraint. - //NOTE: the naive algorithm is not optimal. Counter example: - //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - //With these parameters, the naive algo fails, whereas there is a solution: - //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &ClusterLayout) -> Result { - let over_size = cl.partition_size +1; - let mut zone_token = HashMap::::new(); - let nb_partitions = 1usize << PARTITION_BITS; - - let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.useful_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; - zone_token.insert(z.clone(), zone_token[&z] + min(nb_partitions , (c/over_size) as usize)); - } - - //For every partition, we count the number of zone already associated and - //the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z,t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; nb_partitions]; - let mut last_zone = vec![zones.len(); nb_partitions]; - - let mut curr_zone = 0; - - let redundancy = cl.parameters.zone_redundancy; - - for replic in 0..cl.replication_factor { - for p in 0..nb_partitions { - while id_zone_token[curr_zone] == 0 || - (last_zone[p] == curr_zone - && redundancy - nb_token[p] <= cl.replication_factor - replic) { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg : &Message) { - for s in msg.iter(){ - println!("{}",s); - } - } + use super::{Error, *}; + use std::cmp::min; + + //This function checks that the partition size S computed is at least better than the + //one given by a very naive algorithm. To do so, we try to run the naive algorithm + //assuming a partion size of S+1. If we succed, it means that the optimal assignation + //was not optimal. The naive algorithm is the following : + //- we compute the max number of partitions associated to every node, capped at the + //partition number. It gives the number of tokens of every node. + //- every zone has a number of tokens equal to the sum of the tokens of its nodes. + //- we cycle over the partitions and associate zone tokens while respecting the + //zone redundancy constraint. + //NOTE: the naive algorithm is not optimal. Counter example: + //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + //With these parameters, the naive algo fails, whereas there is a solution: + //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &ClusterLayout) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + let nb_partitions = 1usize << PARTITION_BITS; + + let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.useful_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert( + z.clone(), + zone_token[&z] + min(nb_partitions, (c / over_size) as usize), + ); + } + + //For every partition, we count the number of zone already associated and + //the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; nb_partitions]; + let mut last_zone = vec![zones.len(); nb_partitions]; + + let mut curr_zone = 0; + + let redundancy = cl.parameters.zone_redundancy; + + for replic in 0..cl.replication_factor { + for p in 0..nb_partitions { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } + + fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } + } fn update_layout( cl: &mut ClusterLayout, node_id_vec: &Vec, node_capacity_vec: &Vec, node_zone_vec: &Vec, - zone_redundancy: usize + zone_redundancy: usize, ) { for i in 0..node_id_vec.len() { if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { @@ -901,12 +1018,12 @@ mod tests { ); cl.roles.merge(&update); } - cl.staged_parameters = Lww::::new(LayoutParameters{zone_redundancy}); + cl.staged_parameters = Lww::::new(LayoutParameters { zone_redundancy }); } #[test] fn test_assignation() { - let mut node_id_vec = vec![1, 2, 3]; + let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] .into_iter() @@ -936,11 +1053,12 @@ mod tests { assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); - node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); - } } -- cgit v1.2.3 From e5664c9822c6ed1ecb30cac41b6a4125da3f88e7 Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 11 Oct 2022 17:17:13 +0200 Subject: Improved the statistics displayed in layout show corrected a few bugs --- src/rpc/layout.rs | 105 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 39 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 3a6f42ee..d2ed8af8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -205,6 +205,7 @@ To know the correct value of the new layout version, invoke `garage layout show` self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + self.staged_parameters.update(self.parameters.clone()); self.version += 1; @@ -267,6 +268,26 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + ///Returns the number of partitions associated to this node in the ring + pub fn get_node_usage(&self, uuid: &Uuid) -> Result { + for (i, id) in self.node_id_vec.iter().enumerate() { + if id == uuid { + let mut count = 0; + for nod in self.ring_assignation_data.iter() { + if i as u8 == *nod { + count += 1 + } + } + return Ok(count); + } + } + Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )) + } + ///Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; @@ -357,11 +378,10 @@ To know the correct value of the new layout version, invoke `garage layout show` //algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); - let partition_size = cl2 - .compute_optimal_partition_size(&zone_to_id) - .expect("Critical Error"); - if partition_size != self.partition_size { - return false; + match cl2.compute_optimal_partition_size(&zone_to_id) { + Ok(s) if s != self.partition_size => return false, + Err(_) => return false, + _ => (), } true @@ -376,8 +396,9 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - /// Staged changes must be merged with nodes roles before calling this function. - pub fn calculate_partition_assignation(&mut self) -> Result { + // Staged role changes must be merged with nodes roles before calling this function, + // hence it must only be called from apply_staged_changes() and it is not public. + fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. @@ -386,13 +407,15 @@ impl ClusterLayout { //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - let redundancy = self.staged_parameters.get().zone_redundancy; + self.parameters = self.staged_parameters.get().clone(); let mut msg = Message::new(); + msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); + msg.push("".into()); msg.push(format!( - "Computation of a new cluster layout where partitions are \ + "Partitions are \ replicated {} times on at least {} distinct zones.", - self.replication_factor, redundancy + self.replication_factor, self.parameters.zone_redundancy )); //We generate for once numerical ids for the zones of non gateway nodes, @@ -400,11 +423,6 @@ impl ClusterLayout { let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; let nb_useful_nodes = self.useful_nodes().len(); - msg.push(format!( - "The cluster contains {} nodes spread over {} zones.", - nb_useful_nodes, - id_to_zone.len() - )); if nb_useful_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ @@ -412,12 +430,12 @@ impl ClusterLayout { nb_useful_nodes, self.replication_factor ))); } - if id_to_zone.len() < redundancy { + if id_to_zone.len() < self.parameters.zone_redundancy { return Err(Error::Message(format!( "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", id_to_zone.len(), - redundancy + self.parameters.zone_redundancy ))); } @@ -429,10 +447,8 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!( - "Given the replication and redundancy constraint, the \ - optimal size of a partition is {}. In the previous layout, it used to \ - be {} (the zone redundancy was {}).", - partition_size, self.partition_size, self.parameters.zone_redundancy + "Optimal size of a partition: {} (was {} in the previous layout).", + partition_size, self.partition_size )); } else { msg.push(format!( @@ -442,7 +458,6 @@ impl ClusterLayout { )); } self.partition_size = partition_size; - self.parameters = self.staged_parameters.get().clone(); if partition_size < 100 { msg.push( @@ -470,6 +485,13 @@ impl ClusterLayout { //We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + + if !self.check() { + return Err(Error::Message( + "Critical error: The computed layout happens to be incorrect".into(), + )); + } + Ok(msg) } @@ -553,12 +575,6 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); - if !self.check() { - return Err(Error::Message( - "Critical error: The computed layout happens to be incorrect".into(), - )); - } - Ok(Some(old_assignation)) } @@ -652,7 +668,7 @@ impl ClusterLayout { ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.staged_parameters.get().zone_redundancy; + let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; g.add_edge( @@ -774,8 +790,9 @@ impl ClusterLayout { let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push("".into()); msg.push(format!( - "Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap, total_cap, percent_cap )); msg.push("".into()); @@ -878,7 +895,7 @@ impl ClusterLayout { } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); msg.push(format!( - " Available capacity / Total capacity: {}/{} ({:.1}%).", + " Usable capacity / Total capacity: {}/{} ({:.1}%).", available_cap_z, total_cap_z, percent_cap_z )); @@ -891,7 +908,7 @@ impl ClusterLayout { .tags_string(); msg.push(format!( " Node {}: {} partitions ({} new) ; \ - available/total capacity: {} / {} ({:.1}%) ; tags:{}", + usable/total capacity: {} / {} ({:.1}%) ; tags:{}", &self.node_id_vec[*n].to_vec()[0..2] .to_vec() .encode_hex::(), @@ -1008,7 +1025,7 @@ mod tests { cl.node_id_vec.push(x); } - let update = cl.roles.update_mutator( + let update = cl.staging.update_mutator( cl.node_id_vec[i], NodeRoleV(Some(NodeRole { zone: (node_zone_vec[i].to_string()), @@ -1016,9 +1033,11 @@ mod tests { tags: (vec![]), })), ); - cl.roles.merge(&update); + cl.staging.merge(&update); } - cl.staged_parameters = Lww::::new(LayoutParameters { zone_redundancy }); + cl.staging_hash = blake2sum(&rmp_to_vec_all_named(&cl.staging).unwrap()[..]); + cl.staged_parameters + .update(LayoutParameters { zone_redundancy }); } #[test] @@ -1032,7 +1051,9 @@ mod tests { let mut cl = ClusterLayout::new(3); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); @@ -1043,13 +1064,17 @@ mod tests { .map(|x| x.to_string()) .collect(); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); @@ -1057,7 +1082,9 @@ mod tests { 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, ]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); } -- cgit v1.2.3 From bcdd1e0c3335500a6d0337ce6ee050fb59fc665a Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 11 Oct 2022 18:29:21 +0200 Subject: Added some comment --- src/rpc/layout.rs | 162 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 93 insertions(+), 69 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index d2ed8af8..38e56b88 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -17,6 +17,8 @@ use crate::ring::*; use std::convert::TryInto; +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; + //The Message type will be used to collect information on the algorithm. type Message = Vec; @@ -28,9 +30,11 @@ pub struct ClusterLayout { pub replication_factor: usize, - //This attribute is only used to retain the previously computed partition size, - //to know to what extent does it change with the layout update. + ///This attribute is only used to retain the previously computed partition size, + ///to know to what extent does it change with the layout update. pub partition_size: u32, + ///Parameters used to compute the assignation currently given by + ///ring_assignation_data pub parameters: LayoutParameters, pub roles: LwwMap, @@ -48,8 +52,9 @@ pub struct ClusterLayout { #[serde(with = "serde_bytes")] pub ring_assignation_data: Vec, - /// Role changes which are staged for the next version of the layout + /// Parameters to be used in the next partition assignation computation. pub staged_parameters: Lww, + /// Role changes which are staged for the next version of the layout pub staging: LwwMap, pub staging_hash: Hash, } @@ -65,8 +70,6 @@ impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -77,12 +80,13 @@ impl AutoCrdt for NodeRoleV { /// The user-assigned roles of cluster nodes #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRole { - /// Datacenter at which this entry belong. This information might be used to perform a better - /// geodistribution + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution pub zone: String, - /// The (relative) capacity of the node + /// The capacity of the node /// If this is set to None, the node does not participate in storing data for the system /// and is only active as an API gateway to other nodes + // TODO : change the capacity to u64 and use byte unit input/output pub capacity: Option, /// A set of tags to recognize the node pub tags: Vec, @@ -110,6 +114,7 @@ impl NodeRole { } } +//Implementation of the ClusterLayout methods unrelated to the assignation algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { //We set the default zone redundancy to be equal to the replication factor, @@ -231,7 +236,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } ///Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub fn useful_nodes(&self) -> Vec { + pub fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { @@ -291,13 +296,14 @@ To know the correct value of the new layout version, invoke `garage layout show` ///Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; - for uuid in self.useful_nodes().iter() { + for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; } Ok(total_capacity) } /// Check a cluster layout for internal consistency + /// (assignation, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> bool { // Check that the hash of the staging data is correct @@ -377,7 +383,7 @@ To know the correct value of the new layout version, invoke `garage layout show` //Check that the partition size stored is the one computed by the asignation //algorithm. let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); match cl2.compute_optimal_partition_size(&zone_to_id) { Ok(s) if s != self.partition_size => return false, Err(_) => return false, @@ -388,6 +394,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } +//Implementation of the ClusterLayout methods related to the assignation algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor @@ -397,16 +404,13 @@ impl ClusterLayout { /// the former assignation (if any) to minimize the amount of /// data to be moved. // Staged role changes must be merged with nodes roles before calling this function, - // hence it must only be called from apply_staged_changes() and it is not public. + // hence it must only be called from apply_staged_changes() and hence is not public. fn calculate_partition_assignation(&mut self) -> Result { - //The nodes might have been updated, some might have been deleted. - //So we need to first update the list of nodes and retrieve the - //assignation. - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with the new ids + //changes in the layout. We retrieve the old_assignation reframed with new ids let old_assignation_opt = self.update_node_id_vec()?; + //We update the parameters self.parameters = self.staged_parameters.get().clone(); let mut msg = Message::new(); @@ -420,14 +424,14 @@ impl ClusterLayout { //We generate for once numerical ids for the zones of non gateway nodes, //to use them as indices in the flow graphs. - let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; + let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_useful_nodes = self.useful_nodes().len(); - if nb_useful_nodes < self.replication_factor { + let nb_nongateway_nodes = self.nongateway_nodes().len(); + if nb_nongateway_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_useful_nodes, self.replication_factor + nb_nongateway_nodes, self.replication_factor ))); } if id_to_zone.len() < self.parameters.zone_redundancy { @@ -457,6 +461,7 @@ impl ClusterLayout { partition_size )); } + //We write the partition size. self.partition_size = partition_size; if partition_size < 100 { @@ -467,14 +472,15 @@ impl ClusterLayout { ); } - //We compute a first flow/assignment that is heuristically close to the previous - //assignment - let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignation_opt)?; + //We compute a first flow/assignation that is heuristically close to the previous + //assignation + let mut gflow = self.compute_candidate_assignation(&zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignment. + //We minimize the distance to the previous assignation. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } + //We display statistics of the computation msg.append(&mut self.output_stat( &gflow, &old_assignation_opt, @@ -538,14 +544,13 @@ impl ClusterLayout { // (2) We retrieve the old association //We rewrite the old association with the new indices. We only consider partition //to node assignations where the node is still in use. - let nb_partitions = 1usize << PARTITION_BITS; - let mut old_assignation = vec![Vec::::new(); nb_partitions]; + let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; if self.ring_assignation_data.is_empty() { //This is a new association return Ok(None); } - if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( "The old assignation does not have a size corresponding to \ the old replication factor or the number of partitions." @@ -580,11 +585,11 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. - fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - for uuid in self.useful_nodes().iter() { + for uuid in self.nongateway_nodes().iter() { if self.roles.get(uuid) == None { return Err(Error::Message( "The uuid was not found in the node roles (this should \ @@ -603,17 +608,16 @@ impl ClusterLayout { } ///This function computes by dichotomy the largest realizable partition size, given - ///the layout. + ///the layout roles and parameters. fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, ) -> Result { - let nb_partitions = 1usize << PARTITION_BITS; let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? - < (nb_partitions * self.replication_factor) + < (NB_PARTITIONS * self.replication_factor) .try_into() .unwrap() { @@ -630,7 +634,7 @@ impl ClusterLayout { g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? - < (nb_partitions * self.replication_factor) + < (NB_PARTITIONS * self.replication_factor) .try_into() .unwrap() { @@ -658,14 +662,21 @@ impl ClusterLayout { vertices } + ///Generates the graph to compute the maximal flow corresponding to the optimal + ///partition assignation. + ///exclude_assoc is the set of (partition, node) association that we are forbidden + ///to use (hence we do not add the corresponding edge to the graph). This parameter + ///is used to compute a first flow that uses only edges appearing in the previous + ///assignation. This produces a solution that heuristically should be close to the + ///previous one. fn generate_flow_graph( &self, - size: u32, + partition_size: u32, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, ) -> Result, Error> { let vertices = - ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); + ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); let redundancy = self.parameters.zone_redundancy; @@ -685,10 +696,10 @@ impl ClusterLayout { )?; } } - for n in 0..self.useful_nodes().len() { + for n in 0..self.nongateway_nodes().len() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / size)?; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; @@ -698,28 +709,34 @@ impl ClusterLayout { Ok(g) } - fn compute_candidate_assignment( + ///This function computes a first optimal assignation (in the form of a flow graph). + fn compute_candidate_assignation( &self, zone_to_id: &HashMap, - old_assoc_opt: &Option>>, + prev_assign_opt: &Option>>, ) -> Result, Error> { - //We list the edges that are not used in the old association + //We list the (partition,node) associations that are not used in the + //previous assignation let mut exclude_edge = HashSet::<(usize, usize)>::new(); - if let Some(old_assoc) = old_assoc_opt { - let nb_nodes = self.useful_nodes().len(); - for (p, old_assoc_p) in old_assoc.iter().enumerate() { + if let Some(prev_assign) = prev_assign_opt { + let nb_nodes = self.nongateway_nodes().len(); + for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); } - for n in old_assoc_p.iter() { + for n in prev_assign_p.iter() { exclude_edge.remove(&(p, *n)); } } } - //We compute the best flow using only the edges used in the old assoc + //We compute the best flow using only the edges used in the previous assignation let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; g.compute_maximal_flow()?; + + //We add the excluded edges and compute the maximal flow with the full graph. + //The algorithm is such that it will start with the flow that we just computed + //and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; @@ -728,26 +745,35 @@ impl ClusterLayout { Ok(g) } + ///This function updates the flow graph gflow to minimize the distance between + ///its corresponding assignation and the previous one fn minimize_rebalance_load( &self, gflow: &mut Graph, zone_to_id: &HashMap, - old_assoc: &[Vec], + prev_assign: &[Vec], ) -> Result<(), Error> { + //We define a cost function on the edges (pairs of vertices) corresponding + //to the distance between the two assignations. let mut cost = CostFunction::new(); - for (p, assoc_p) in old_assoc.iter().enumerate() { + for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } - let nb_nodes = self.useful_nodes().len(); + + //We compute the maximal length of a simple path in gflow. It is used in the + //Bellman-Ford algorithm in optimize_flow_with_cost to set the number + //of iterations. + let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; Ok(()) } + ///This function updates the assignation ring from the flow graph. fn update_ring_from_flow( &mut self, nb_zones: usize, @@ -775,19 +801,18 @@ impl ClusterLayout { Ok(()) } - //This function returns a message summing up the partition repartition of the new - //layout. + ///This function returns a message summing up the partition repartition of the new + ///layout, and other statistics of the partition assignation computation. fn output_stat( &self, gflow: &Graph, - old_assoc_opt: &Option>>, + prev_assign_opt: &Option>>, zone_to_id: &HashMap, id_to_zone: &[String], ) -> Result { let mut msg = Message::new(); - let nb_partitions = 1usize << PARTITION_BITS; - let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; + let used_cap = self.partition_size * NB_PARTITIONS as u32 * self.replication_factor as u32; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push("".into()); @@ -813,21 +838,21 @@ impl ClusterLayout { )); //We define and fill in the following tables - let storing_nodes = self.useful_nodes(); + let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; let mut new_partitions_zone = vec![0; id_to_zone.len()]; let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - for p in 0..nb_partitions { + for p in 0..NB_PARTITIONS { for z in 0..id_to_zone.len() { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; if !pz_nodes.is_empty() { stored_partitions_zone[z] += 1; - if let Some(old_assoc) = old_assoc_opt { + if let Some(prev_assign) = prev_assign_opt { let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { + for n in prev_assign[p].iter() { old_zones_of_p .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); } @@ -839,8 +864,8 @@ impl ClusterLayout { for vert in pz_nodes.iter() { if let Vertex::N(n) = *vert { stored_partitions[n] += 1; - if let Some(old_assoc) = old_assoc_opt { - if !old_assoc[p].contains(&n) { + if let Some(prev_assign) = prev_assign_opt { + if !prev_assign[p].contains(&n) { new_partitions[n] += 1; } } @@ -849,7 +874,7 @@ impl ClusterLayout { } } - if *old_assoc_opt == None { + if *prev_assign_opt == None { new_partitions = stored_partitions.clone(); new_partitions_zone = stored_partitions_zone.clone(); } @@ -857,7 +882,7 @@ impl ClusterLayout { //We display the statistics msg.push("".into()); - if *old_assoc_opt != None { + if *prev_assign_opt != None { let total_new_partitions: usize = new_partitions.iter().sum(); msg.push(format!( "A total of {} new copies of partitions need to be \ @@ -950,9 +975,8 @@ mod tests { fn check_against_naive(cl: &ClusterLayout) -> Result { let over_size = cl.partition_size + 1; let mut zone_token = HashMap::::new(); - let nb_partitions = 1usize << PARTITION_BITS; - let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; if zones.is_empty() { return Ok(false); @@ -961,12 +985,12 @@ mod tests { for z in zones.iter() { zone_token.insert(z.clone(), 0); } - for uuid in cl.useful_nodes().iter() { + for uuid in cl.nongateway_nodes().iter() { let z = cl.get_node_zone(uuid)?; let c = cl.get_node_capacity(uuid)?; zone_token.insert( z.clone(), - zone_token[&z] + min(nb_partitions, (c / over_size) as usize), + zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), ); } @@ -978,15 +1002,15 @@ mod tests { id_zone_token[zone_to_id[z]] = *t; } - let mut nb_token = vec![0; nb_partitions]; - let mut last_zone = vec![zones.len(); nb_partitions]; + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; let mut curr_zone = 0; let redundancy = cl.parameters.zone_redundancy; for replic in 0..cl.replication_factor { - for p in 0..nb_partitions { + for p in 0..NB_PARTITIONS { while id_zone_token[curr_zone] == 0 || (last_zone[p] == curr_zone && redundancy - nb_token[p] <= cl.replication_factor - replic) -- cgit v1.2.3 From ea5afc251106b3f6e2d07f942ba1f88abeef8765 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 19:34:40 +0100 Subject: Style improvements --- src/rpc/layout.rs | 247 +++++++++++++++++++++++++++--------------------------- 1 file changed, 125 insertions(+), 122 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 38e56b88..95f69dc8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -19,7 +19,7 @@ use std::convert::TryInto; const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; -//The Message type will be used to collect information on the algorithm. +// The Message type will be used to collect information on the algorithm. type Message = Vec; /// The layout of the cluster, i.e. the list of roles @@ -30,11 +30,11 @@ pub struct ClusterLayout { pub replication_factor: usize, - ///This attribute is only used to retain the previously computed partition size, - ///to know to what extent does it change with the layout update. + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. pub partition_size: u32, - ///Parameters used to compute the assignation currently given by - ///ring_assignation_data + /// Parameters used to compute the assignation currently given by + /// ring_assignation_data pub parameters: LayoutParameters, pub roles: LwwMap, @@ -53,14 +53,14 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Parameters to be used in the next partition assignation computation. - pub staged_parameters: Lww, + pub staging_parameters: Lww, /// Role changes which are staged for the next version of the layout - pub staging: LwwMap, + pub staging_roles: LwwMap, pub staging_hash: Hash, } -///This struct is used to set the parameters to be used in the assignation computation -///algorithm. It is stored as a Crdt. +/// This struct is used to set the parameters to be used in the assignation computation +/// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct LayoutParameters { pub zone_redundancy: usize, @@ -114,20 +114,19 @@ impl NodeRole { } } -//Implementation of the ClusterLayout methods unrelated to the assignation algorithm. +// Implementation of the ClusterLayout methods unrelated to the assignation algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - //We set the default zone redundancy to be equal to the replication factor, - //i.e. as strict as possible. + // We set the default zone redundancy to be equal to the replication factor, + // i.e. as strict as possible. let parameters = LayoutParameters { zone_redundancy: replication_factor, }; - let staged_parameters = Lww::::new(parameters.clone()); + let staging_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); - let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); - ClusterLayout { + let mut ret = ClusterLayout { version: 0, replication_factor, partition_size: 0, @@ -135,10 +134,17 @@ impl ClusterLayout { node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), parameters, - staged_parameters, - staging: empty_lwwmap, - staging_hash: empty_lwwmap_hash, - } + staging_parameters, + staging_roles: empty_lwwmap, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + + fn calculate_staging_hash(&self) -> Hash { + let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&rmp_to_vec_all_named(&hashed_tuple).unwrap()[..]) } pub fn merge(&mut self, other: &ClusterLayout) -> bool { @@ -148,16 +154,15 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); - self.staged_parameters.merge(&other.staged_parameters); - self.staging.merge(&other.staging); + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); - let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - let stage_changed = new_staging_hash != self.staging_hash; + let new_staging_hash = self.calculate_staging_hash(); + let changed = new_staging_hash != self.staging_hash; self.staging_hash = new_staging_hash; - stage_changed || param_changed + changed } Ordering::Less => false, } @@ -179,13 +184,14 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - self.roles.merge(&self.staging); + self.roles.merge(&self.staging_roles); self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = self.staging_parameters.get().clone(); let msg = self.calculate_partition_assignation()?; - self.staging.clear(); - self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -208,9 +214,9 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - self.staging.clear(); - self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - self.staged_parameters.update(self.parameters.clone()); + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); + self.staging_parameters.update(self.parameters.clone()); self.version += 1; @@ -235,7 +241,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. pub fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { @@ -247,7 +253,7 @@ To know the correct value of the new layout version, invoke `garage layout show` result } - ///Given a node uuids, this function returns the label of its zone + /// Given a node uuids, this function returns the label of its zone pub fn get_node_zone(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(role) => Ok(role.zone.clone()), @@ -257,7 +263,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Given a node uuids, this function returns its capacity or fails if it does not have any + /// Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole { @@ -273,7 +279,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the number of partitions associated to this node in the ring + /// Returns the number of partitions associated to this node in the ring pub fn get_node_usage(&self, uuid: &Uuid) -> Result { for (i, id) in self.node_id_vec.iter().enumerate() { if id == uuid { @@ -293,7 +299,7 @@ To know the correct value of the new layout version, invoke `garage layout show` )) } - ///Returns the sum of capacities of non gateway nodes in the cluster + /// Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { @@ -307,7 +313,7 @@ To know the correct value of the new layout version, invoke `garage layout show` /// returns true if consistent, false if error pub fn check(&self) -> bool { // Check that the hash of the staging data is correct - let staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + let staging_hash = self.calculate_staging_hash(); if staging_hash != self.staging_hash { return false; } @@ -346,14 +352,14 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that every partition is associated to distinct nodes + // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); if nodes_of_p.iter().unique().count() != rf { return false; } - //Check that every partition is spread over at least zone_redundancy zones. + // Check that every partition is spread over at least zone_redundancy zones. let zones_of_p = nodes_of_p.iter().map(|n| { self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.") @@ -364,7 +370,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that the nodes capacities is consistent with the stored partitions + // Check that the nodes capacities is consistent with the stored partitions let mut node_usage = vec![0; MAX_NODE_NUMBER]; for n in self.ring_assignation_data.iter() { node_usage[*n as usize] += 1; @@ -380,8 +386,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that the partition size stored is the one computed by the asignation - //algorithm. + // Check that the partition size stored is the one computed by the asignation + // algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); match cl2.compute_optimal_partition_size(&zone_to_id) { @@ -394,7 +400,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } -//Implementation of the ClusterLayout methods related to the assignation algorithm. +// Implementation of the ClusterLayout methods related to the assignation algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor @@ -403,16 +409,13 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - // Staged role changes must be merged with nodes roles before calling this function, - // hence it must only be called from apply_staged_changes() and hence is not public. + /// Staged role changes must be merged with nodes roles before calling this function, + /// hence it must only be called from apply_staged_changes() and hence is not public. fn calculate_partition_assignation(&mut self) -> Result { - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with new ids + // We update the node ids, since the node role list might have changed with the + // changes in the layout. We retrieve the old_assignation reframed with new ids let old_assignation_opt = self.update_node_id_vec()?; - //We update the parameters - self.parameters = self.staged_parameters.get().clone(); - let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("".into()); @@ -422,8 +425,8 @@ impl ClusterLayout { self.replication_factor, self.parameters.zone_redundancy )); - //We generate for once numerical ids for the zones of non gateway nodes, - //to use them as indices in the flow graphs. + // We generate for once numerical ids for the zones of non gateway nodes, + // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; let nb_nongateway_nodes = self.nongateway_nodes().len(); @@ -443,10 +446,10 @@ impl ClusterLayout { ))); } - //We compute the optimal partition size - //Capacities should be given in a unit so that partition size is at least 100. - //In this case, integer rounding plays a marginal role in the percentages of - //optimality. + // We compute the optimal partition size + // Capacities should be given in a unit so that partition size is at least 100. + // In this case, integer rounding plays a marginal role in the percentages of + // optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; if old_assignation_opt != None { @@ -461,7 +464,7 @@ impl ClusterLayout { partition_size )); } - //We write the partition size. + // We write the partition size. self.partition_size = partition_size; if partition_size < 100 { @@ -472,15 +475,15 @@ impl ClusterLayout { ); } - //We compute a first flow/assignation that is heuristically close to the previous - //assignation + // We compute a first flow/assignation that is heuristically close to the previous + // assignation let mut gflow = self.compute_candidate_assignation(&zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignation. + // We minimize the distance to the previous assignation. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } - //We display statistics of the computation + // We display statistics of the computation msg.append(&mut self.output_stat( &gflow, &old_assignation_opt, @@ -489,7 +492,7 @@ impl ClusterLayout { )?); msg.push("".to_string()); - //We update the layout structure + // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; if !self.check() { @@ -508,8 +511,8 @@ impl ClusterLayout { /// do modify assignation_ring and node_id_vec. fn update_node_id_vec(&mut self) -> Result>>, Error> { // (1) We compute the new node list - //Non gateway nodes should be coded on 8bits, hence they must be first in the list - //We build the new node ids + // Non gateway nodes should be coded on 8bits, hence they must be first in the list + // We build the new node ids let mut new_non_gateway_nodes: Vec = self .roles .items() @@ -542,12 +545,12 @@ impl ClusterLayout { self.node_id_vec = new_node_id_vec.clone(); // (2) We retrieve the old association - //We rewrite the old association with the new indices. We only consider partition - //to node assignations where the node is still in use. + // We rewrite the old association with the new indices. We only consider partition + // to node assignations where the node is still in use. let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; if self.ring_assignation_data.is_empty() { - //This is a new association + // This is a new association return Ok(None); } if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { @@ -558,11 +561,11 @@ impl ClusterLayout { )); } - //We build a translation table between the uuid and new ids + // We build a translation table between the uuid and new ids let mut uuid_to_new_id = HashMap::::new(); - //We add the indices of only the new non-gateway nodes that can be used in the - //association ring + // We add the indices of only the new non-gateway nodes that can be used in the + // association ring for (i, uuid) in new_node_id_vec.iter().enumerate() { uuid_to_new_id.insert(*uuid, i); } @@ -577,14 +580,14 @@ impl ClusterLayout { } } - //We write the ring + // We write the ring self.ring_assignation_data = Vec::::new(); Ok(Some(old_assignation)) } - ///This function generates ids for the zone of the nodes appearing in - ///self.node_id_vec. + /// This function generates ids for the zone of the nodes appearing in + /// self.node_id_vec. fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); @@ -607,8 +610,8 @@ impl ClusterLayout { Ok((id_to_zone, zone_to_id)) } - ///This function computes by dichotomy the largest realizable partition size, given - ///the layout roles and parameters. + /// This function computes by dichotomy the largest realizable partition size, given + /// the layout roles and parameters. fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, @@ -662,13 +665,13 @@ impl ClusterLayout { vertices } - ///Generates the graph to compute the maximal flow corresponding to the optimal - ///partition assignation. - ///exclude_assoc is the set of (partition, node) association that we are forbidden - ///to use (hence we do not add the corresponding edge to the graph). This parameter - ///is used to compute a first flow that uses only edges appearing in the previous - ///assignation. This produces a solution that heuristically should be close to the - ///previous one. + /// Generates the graph to compute the maximal flow corresponding to the optimal + /// partition assignation. + /// exclude_assoc is the set of (partition, node) association that we are forbidden + /// to use (hence we do not add the corresponding edge to the graph). This parameter + /// is used to compute a first flow that uses only edges appearing in the previous + /// assignation. This produces a solution that heuristically should be close to the + /// previous one. fn generate_flow_graph( &self, partition_size: u32, @@ -709,14 +712,14 @@ impl ClusterLayout { Ok(g) } - ///This function computes a first optimal assignation (in the form of a flow graph). + /// This function computes a first optimal assignation (in the form of a flow graph). fn compute_candidate_assignation( &self, zone_to_id: &HashMap, prev_assign_opt: &Option>>, ) -> Result, Error> { - //We list the (partition,node) associations that are not used in the - //previous assignation + // We list the (partition,node) associations that are not used in the + // previous assignation let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { let nb_nodes = self.nongateway_nodes().len(); @@ -730,13 +733,13 @@ impl ClusterLayout { } } - //We compute the best flow using only the edges used in the previous assignation + // We compute the best flow using only the edges used in the previous assignation let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; g.compute_maximal_flow()?; - //We add the excluded edges and compute the maximal flow with the full graph. - //The algorithm is such that it will start with the flow that we just computed - //and find ameliorating paths from that. + // We add the excluded edges and compute the maximal flow with the full graph. + // The algorithm is such that it will start with the flow that we just computed + // and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; @@ -745,16 +748,16 @@ impl ClusterLayout { Ok(g) } - ///This function updates the flow graph gflow to minimize the distance between - ///its corresponding assignation and the previous one + /// This function updates the flow graph gflow to minimize the distance between + /// its corresponding assignation and the previous one fn minimize_rebalance_load( &self, gflow: &mut Graph, zone_to_id: &HashMap, prev_assign: &[Vec], ) -> Result<(), Error> { - //We define a cost function on the edges (pairs of vertices) corresponding - //to the distance between the two assignations. + // We define a cost function on the edges (pairs of vertices) corresponding + // to the distance between the two assignations. let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { @@ -763,9 +766,9 @@ impl ClusterLayout { } } - //We compute the maximal length of a simple path in gflow. It is used in the - //Bellman-Ford algorithm in optimize_flow_with_cost to set the number - //of iterations. + // We compute the maximal length of a simple path in gflow. It is used in the + // Bellman-Ford algorithm in optimize_flow_with_cost to set the number + // of iterations. let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -773,7 +776,7 @@ impl ClusterLayout { Ok(()) } - ///This function updates the assignation ring from the flow graph. + /// This function updates the assignation ring from the flow graph. fn update_ring_from_flow( &mut self, nb_zones: usize, @@ -801,8 +804,8 @@ impl ClusterLayout { Ok(()) } - ///This function returns a message summing up the partition repartition of the new - ///layout, and other statistics of the partition assignation computation. + /// This function returns a message summing up the partition repartition of the new + /// layout, and other statistics of the partition assignation computation. fn output_stat( &self, gflow: &Graph, @@ -837,7 +840,7 @@ impl ClusterLayout { used_cap / self.replication_factor as u32 )); - //We define and fill in the following tables + // We define and fill in the following tables let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -879,7 +882,7 @@ impl ClusterLayout { new_partitions_zone = stored_partitions_zone.clone(); } - //We display the statistics + // We display the statistics msg.push("".into()); if *prev_assign_opt != None { @@ -951,27 +954,27 @@ impl ClusterLayout { } } -//==================================================================================== +// ==================================================================================== #[cfg(test)] mod tests { use super::{Error, *}; use std::cmp::min; - //This function checks that the partition size S computed is at least better than the - //one given by a very naive algorithm. To do so, we try to run the naive algorithm - //assuming a partion size of S+1. If we succed, it means that the optimal assignation - //was not optimal. The naive algorithm is the following : - //- we compute the max number of partitions associated to every node, capped at the - //partition number. It gives the number of tokens of every node. - //- every zone has a number of tokens equal to the sum of the tokens of its nodes. - //- we cycle over the partitions and associate zone tokens while respecting the - //zone redundancy constraint. - //NOTE: the naive algorithm is not optimal. Counter example: - //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - //With these parameters, the naive algo fails, whereas there is a solution: - //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + // This function checks that the partition size S computed is at least better than the + // one given by a very naive algorithm. To do so, we try to run the naive algorithm + // assuming a partion size of S+1. If we succed, it means that the optimal assignation + // was not optimal. The naive algorithm is the following : + // - we compute the max number of partitions associated to every node, capped at the + // partition number. It gives the number of tokens of every node. + // - every zone has a number of tokens equal to the sum of the tokens of its nodes. + // - we cycle over the partitions and associate zone tokens while respecting the + // zone redundancy constraint. + // NOTE: the naive algorithm is not optimal. Counter example: + // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + // With these parameters, the naive algo fails, whereas there is a solution: + // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) fn check_against_naive(cl: &ClusterLayout) -> Result { let over_size = cl.partition_size + 1; let mut zone_token = HashMap::::new(); @@ -994,8 +997,8 @@ mod tests { ); } - //For every partition, we count the number of zone already associated and - //the name of the last zone associated + // For every partition, we count the number of zone already associated and + // the name of the last zone associated let mut id_zone_token = vec![0; zones.len()]; for (z, t) in zone_token.iter() { @@ -1049,7 +1052,7 @@ mod tests { cl.node_id_vec.push(x); } - let update = cl.staging.update_mutator( + let update = cl.staging_roles.update_mutator( cl.node_id_vec[i], NodeRoleV(Some(NodeRole { zone: (node_zone_vec[i].to_string()), @@ -1057,10 +1060,10 @@ mod tests { tags: (vec![]), })), ); - cl.staging.merge(&update); + cl.staging_roles.merge(&update); } - cl.staging_hash = blake2sum(&rmp_to_vec_all_named(&cl.staging).unwrap()[..]); - cl.staged_parameters + cl.staging_hash = cl.calculate_staging_hash(); + cl.staging_parameters .update(LayoutParameters { zone_redundancy }); } -- cgit v1.2.3 From fd5bc142b553d716c8265d83cff0bb633aa09e6b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 20:29:25 +0100 Subject: Ensure .sort() is called before counting unique items --- src/rpc/layout.rs | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 95f69dc8..15765662 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -355,17 +355,22 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + let mut nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + nodes_of_p.sort(); if nodes_of_p.iter().unique().count() != rf { return false; } // Check that every partition is spread over at least zone_redundancy zones. - let zones_of_p = nodes_of_p.iter().map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }); + let mut zones_of_p = nodes_of_p + .iter() + .map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }) + .collect::>(); + zones_of_p.sort(); let redundancy = self.parameters.zone_redundancy; - if zones_of_p.unique().count() < redundancy { + if zones_of_p.iter().unique().count() < redundancy { return false; } } @@ -378,9 +383,7 @@ To know the correct value of the new layout version, invoke `garage layout show` for (n, usage) in node_usage.iter().enumerate() { if *usage > 0 { let uuid = self.node_id_vec[n]; - if usage * self.partition_size - > self.get_node_capacity(&uuid).expect("Critical Error") - { + if usage * self.partition_size > self.get_node_capacity(&uuid).unwrap() { return false; } } @@ -389,7 +392,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that the partition size stored is the one computed by the asignation // algorithm. let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); match cl2.compute_optimal_partition_size(&zone_to_id) { Ok(s) if s != self.partition_size => return false, Err(_) => return false, @@ -484,12 +487,7 @@ impl ClusterLayout { } // We display statistics of the computation - msg.append(&mut self.output_stat( - &gflow, - &old_assignation_opt, - &zone_to_id, - &id_to_zone, - )?); + msg.extend(self.output_stat(&gflow, &old_assignation_opt, &zone_to_id, &id_to_zone)?); msg.push("".to_string()); // We update the layout structure -- cgit v1.2.3 From 73a4ca8b1515f95bf7860fc292c12db83d3c6228 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 21:12:11 +0100 Subject: Use bytes as capacity units --- src/rpc/layout.rs | 62 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 15765662..3c80b213 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; -use hex::ToHex; +use bytesize::ByteSize; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -32,7 +32,7 @@ pub struct ClusterLayout { /// This attribute is only used to retain the previously computed partition size, /// to know to what extent does it change with the layout update. - pub partition_size: u32, + pub partition_size: u64, /// Parameters used to compute the assignation currently given by /// ring_assignation_data pub parameters: LayoutParameters, @@ -86,8 +86,7 @@ pub struct NodeRole { /// The capacity of the node /// If this is set to None, the node does not participate in storing data for the system /// and is only active as an API gateway to other nodes - // TODO : change the capacity to u64 and use byte unit input/output - pub capacity: Option, + pub capacity: Option, /// A set of tags to recognize the node pub tags: Vec, } @@ -95,7 +94,7 @@ pub struct NodeRole { impl NodeRole { pub fn capacity_string(&self) -> String { match self.capacity { - Some(c) => format!("{}", c), + Some(c) => ByteSize::b(c).to_string_as(false), None => "gateway".to_string(), } } @@ -264,7 +263,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole { capacity: Some(cap), @@ -300,7 +299,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -458,13 +457,14 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!( "Optimal size of a partition: {} (was {} in the previous layout).", - partition_size, self.partition_size + ByteSize::b(partition_size).to_string_as(false), + ByteSize::b(self.partition_size).to_string_as(false) )); } else { msg.push(format!( "Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", - partition_size + ByteSize::b(partition_size).to_string_as(false) )); } // We write the partition size. @@ -613,7 +613,7 @@ impl ClusterLayout { fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, - ) -> Result { + ) -> Result { let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; @@ -672,7 +672,7 @@ impl ClusterLayout { /// previous one. fn generate_flow_graph( &self, - partition_size: u32, + partition_size: u64, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, ) -> Result, Error> { @@ -682,18 +682,18 @@ impl ClusterLayout { let nb_zones = zone_to_id.len(); let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; g.add_edge( Vertex::Source, Vertex::Pdown(p), - (self.replication_factor - redundancy) as u32, + (self.replication_factor - redundancy) as u64, )?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; g.add_edge( Vertex::Pdown(p), Vertex::PZ(p, z), - self.replication_factor as u32, + self.replication_factor as u64, )?; } } @@ -813,17 +813,19 @@ impl ClusterLayout { ) -> Result { let mut msg = Message::new(); - let used_cap = self.partition_size * NB_PARTITIONS as u32 * self.replication_factor as u32; + let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push("".into()); msg.push(format!( "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", - used_cap, total_cap, percent_cap + ByteSize::b(used_cap).to_string_as(false), + ByteSize::b(total_cap).to_string_as(false), + percent_cap )); msg.push("".into()); msg.push( - "If the percentage is to low, it might be that the \ + "If the percentage is too low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ @@ -833,9 +835,9 @@ impl ClusterLayout { msg.push(format!( "Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", - used_cap, + ByteSize::b(used_cap).to_string_as(false), self.replication_factor, - used_cap / self.replication_factor as u32 + ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) )); // We define and fill in the following tables @@ -914,34 +916,34 @@ impl ClusterLayout { replicated_partitions )); - let available_cap_z: u32 = self.partition_size * replicated_partitions as u32; + let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let mut total_cap_z = 0; for n in nodes_of_z.iter() { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); msg.push(format!( - " Usable capacity / Total capacity: {}/{} ({:.1}%).", - available_cap_z, total_cap_z, percent_cap_z + " Usable capacity / Total capacity: {} / {} ({:.1}%).", + ByteSize::b(available_cap_z).to_string_as(false), + ByteSize::b(total_cap_z).to_string_as(false), + percent_cap_z )); for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u32 * self.partition_size; + let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; let tags_n = (self .node_role(&self.node_id_vec[*n]) .ok_or("Node not found."))? .tags_string(); msg.push(format!( - " Node {}: {} partitions ({} new) ; \ + " Node {:?}: {} partitions ({} new) ; \ usable/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec()[0..2] - .to_vec() - .encode_hex::(), + self.node_id_vec[*n], stored_partitions[*n], new_partitions[*n], - available_cap_n, - total_cap_n, + ByteSize::b(available_cap_n).to_string_as(false), + ByteSize::b(total_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, tags_n )); @@ -1041,7 +1043,7 @@ mod tests { fn update_layout( cl: &mut ClusterLayout, node_id_vec: &Vec, - node_capacity_vec: &Vec, + node_capacity_vec: &Vec, node_zone_vec: &Vec, zone_redundancy: usize, ) { -- cgit v1.2.3 From d75b37b018fc0ce8e3832c8531d9556ff7a345c9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 14:23:08 +0100 Subject: Return more info when layout's .check() fails, fix compilation, fix test --- src/rpc/layout.rs | 70 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 24 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 3c80b213..2f4dc129 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -187,11 +187,11 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.retain(|(_, _, v)| v.0.is_some()); self.parameters = self.staging_parameters.get().clone(); - let msg = self.calculate_partition_assignation()?; - self.staging_roles.clear(); self.staging_hash = self.calculate_staging_hash(); + let msg = self.calculate_partition_assignation()?; + self.version += 1; Ok((self, msg)) @@ -214,8 +214,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); self.staging_parameters.update(self.parameters.clone()); + self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -310,11 +310,11 @@ To know the correct value of the new layout version, invoke `garage layout show` /// Check a cluster layout for internal consistency /// (assignation, roles, parameters, partition size) /// returns true if consistent, false if error - pub fn check(&self) -> bool { + pub fn check(&self) -> Result<(), String> { // Check that the hash of the staging data is correct let staging_hash = self.calculate_staging_hash(); if staging_hash != self.staging_hash { - return false; + return Err("staging_hash is incorrect".into()); } // Check that node_id_vec contains the correct list of nodes @@ -329,12 +329,17 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut node_id_vec = self.node_id_vec.clone(); node_id_vec.sort(); if expected_nodes != node_id_vec { - return false; + return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } // Check that the assignation data has the correct length - if self.ring_assignation_data.len() != (1 << PARTITION_BITS) * self.replication_factor { - return false; + let expected_assignation_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignation_data.len() != expected_assignation_data_len { + return Err(format!( + "ring_assignation_data has incorrect length {} instead of {}", + self.ring_assignation_data.len(), + expected_assignation_data_len + )); } // Check that the assigned nodes are correct identifiers @@ -342,12 +347,15 @@ To know the correct value of the new layout version, invoke `garage layout show` // and that role is not the role of a gateway nodes for x in self.ring_assignation_data.iter() { if *x as usize >= self.node_id_vec.len() { - return false; + return Err(format!( + "ring_assignation_data contains invalid node id {}", + *x + )); } let node = self.node_id_vec[*x as usize]; match self.roles.get(&node) { Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), - _ => return false, + _ => return Err("ring_assignation_data contains id of a gateway node".into()), } } @@ -357,7 +365,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); nodes_of_p.sort(); if nodes_of_p.iter().unique().count() != rf { - return false; + return Err(format!("partition does not contain {} unique node ids", rf)); } // Check that every partition is spread over at least zone_redundancy zones. let mut zones_of_p = nodes_of_p @@ -370,7 +378,10 @@ To know the correct value of the new layout version, invoke `garage layout show` zones_of_p.sort(); let redundancy = self.parameters.zone_redundancy; if zones_of_p.iter().unique().count() < redundancy { - return false; + return Err(format!( + "nodes of partition are in less than {} distinct zones", + redundancy + )); } } @@ -382,8 +393,14 @@ To know the correct value of the new layout version, invoke `garage layout show` for (n, usage) in node_usage.iter().enumerate() { if *usage > 0 { let uuid = self.node_id_vec[n]; - if usage * self.partition_size > self.get_node_capacity(&uuid).unwrap() { - return false; + let partusage = usage * self.partition_size; + let nodecap = self.get_node_capacity(&uuid).unwrap(); + if partusage > nodecap { + return Err(format!( + "node usage ({}) is bigger than node capacity ({})", + usage * self.partition_size, + nodecap + )); } } } @@ -393,12 +410,17 @@ To know the correct value of the new layout version, invoke `garage layout show` let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); match cl2.compute_optimal_partition_size(&zone_to_id) { - Ok(s) if s != self.partition_size => return false, - Err(_) => return false, + Ok(s) if s != self.partition_size => { + return Err(format!( + "partition_size ({}) is different than optimal value ({})", + self.partition_size, s + )) + } + Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), _ => (), } - true + Ok(()) } } @@ -493,9 +515,9 @@ impl ClusterLayout { // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; - if !self.check() { + if let Err(e) = self.check() { return Err(Error::Message( - "Critical error: The computed layout happens to be incorrect".into(), + format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) )); } @@ -1062,9 +1084,9 @@ mod tests { ); cl.staging_roles.merge(&update); } - cl.staging_hash = cl.calculate_staging_hash(); cl.staging_parameters .update(LayoutParameters { zone_redundancy }); + cl.staging_hash = cl.calculate_staging_hash(); } #[test] @@ -1081,7 +1103,7 @@ mod tests { let v = cl.version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; @@ -1094,7 +1116,7 @@ mod tests { let v = cl.version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; @@ -1102,7 +1124,7 @@ mod tests { let v = cl.version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![ @@ -1112,7 +1134,7 @@ mod tests { let v = cl.version; let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); } } -- cgit v1.2.3 From ec12d6c8ddde0f1dc908e43fef0ecc88d1e5406b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 16:15:45 +0100 Subject: Slightly simplify code at places --- src/rpc/layout.rs | 61 ++++++++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 43 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 2f4dc129..133e33c8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -100,16 +100,7 @@ impl NodeRole { } pub fn tags_string(&self) -> String { - let mut tags = String::new(); - if self.tags.is_empty() { - return tags; - } - tags.push_str(&self.tags[0].clone()); - for t in 1..self.tags.len() { - tags.push(','); - tags.push_str(&self.tags[t].clone()); - } - tags + self.tags.join(",") } } @@ -241,7 +232,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub fn nongateway_nodes(&self) -> Vec { + fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { @@ -253,7 +244,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid: &Uuid) -> Result { + fn get_node_zone(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(role) => Ok(role.zone.clone()), _ => Err(Error::Message( @@ -299,7 +290,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -494,8 +485,7 @@ impl ClusterLayout { if partition_size < 100 { msg.push( - "WARNING: The partition size is low (< 100), you might consider to \ - provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb)." + "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" .into(), ); } @@ -533,7 +523,7 @@ impl ClusterLayout { // (1) We compute the new node list // Non gateway nodes should be coded on 8bits, hence they must be first in the list // We build the new node ids - let mut new_non_gateway_nodes: Vec = self + let new_non_gateway_nodes: Vec = self .roles .items() .iter() @@ -549,7 +539,7 @@ impl ClusterLayout { ))); } - let mut new_gateway_nodes: Vec = self + let new_gateway_nodes: Vec = self .roles .items() .iter() @@ -558,8 +548,8 @@ impl ClusterLayout { .collect(); let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.append(&mut new_non_gateway_nodes); - new_node_id_vec.append(&mut new_gateway_nodes); + new_node_id_vec.extend(new_non_gateway_nodes); + new_node_id_vec.extend(new_gateway_nodes); let old_node_id_vec = self.node_id_vec.clone(); self.node_id_vec = new_node_id_vec.clone(); @@ -567,12 +557,11 @@ impl ClusterLayout { // (2) We retrieve the old association // We rewrite the old association with the new indices. We only consider partition // to node assignations where the node is still in use. - let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; - if self.ring_assignation_data.is_empty() { // This is a new association return Ok(None); } + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( "The old assignation does not have a size corresponding to \ @@ -590,7 +579,9 @@ impl ClusterLayout { uuid_to_new_id.insert(*uuid, i); } + let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; let rf = self.replication_factor; + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { for old_id in &self.ring_assignation_data[p * rf..(p + 1) * rf] { let uuid = old_node_id_vec[*old_id as usize]; @@ -613,18 +604,10 @@ impl ClusterLayout { let mut zone_to_id = HashMap::::new(); for uuid in self.nongateway_nodes().iter() { - if self.roles.get(uuid) == None { - return Err(Error::Message( - "The uuid was not found in the node roles (this should \ - not happen, it might be a critical error)." - .into(), - )); - } - if let Some(r) = self.node_role(uuid) { - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone(), id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } + let r = self.node_role(uuid).unwrap(); + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); } } Ok((id_to_zone, zone_to_id)) @@ -639,11 +622,7 @@ impl ClusterLayout { let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; - if g.get_flow_value()? - < (NB_PARTITIONS * self.replication_factor) - .try_into() - .unwrap() - { + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { return Err(Error::Message( "The storage capacity of he cluster is to small. It is \ impossible to store partitions of size 1." @@ -656,11 +635,7 @@ impl ClusterLayout { while s_down + 1 < s_up { g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; - if g.get_flow_value()? - < (NB_PARTITIONS * self.replication_factor) - .try_into() - .unwrap() - { + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { s_up = (s_down + s_up) / 2; } else { s_down = (s_down + s_up) / 2; -- cgit v1.2.3 From 9d83364ad911b414e6e8eb56f75bf4bfb2d36239 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 11 Dec 2022 18:30:02 +0100 Subject: itertools .unique() doesn't require sorted items --- src/rpc/layout.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 133e33c8..1cef44d1 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -353,20 +353,18 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { - let mut nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); - nodes_of_p.sort(); + let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); if nodes_of_p.iter().unique().count() != rf { return Err(format!("partition does not contain {} unique node ids", rf)); } // Check that every partition is spread over at least zone_redundancy zones. - let mut zones_of_p = nodes_of_p + let zones_of_p = nodes_of_p .iter() .map(|n| { self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.") }) .collect::>(); - zones_of_p.sort(); let redundancy = self.parameters.zone_redundancy; if zones_of_p.iter().unique().count() < redundancy { return Err(format!( -- cgit v1.2.3 From cb07e6145cf26a9bbbe44fd06090a099030d0750 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Thu, 5 Jan 2023 11:09:25 +0000 Subject: Changed all instances of assignation to assignment. --- src/rpc/layout.rs | 128 +++++++++++++++++++++++++++--------------------------- 1 file changed, 64 insertions(+), 64 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index d756f0aa..c471420c 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -34,8 +34,8 @@ pub struct ClusterLayout { /// This attribute is only used to retain the previously computed partition size, /// to know to what extent does it change with the layout update. pub partition_size: u64, - /// Parameters used to compute the assignation currently given by - /// ring_assignation_data + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data pub parameters: LayoutParameters, pub roles: LwwMap, @@ -48,12 +48,12 @@ pub struct ClusterLayout { /// 2. nodes that don't have a role are excluded (but they need to /// stay in the CRDT as tombstones) pub node_id_vec: Vec, - /// the assignation of data partitions to node, the values + /// the assignment of data partitions to node, the values /// are indices in node_id_vec #[serde(with = "serde_bytes")] - pub ring_assignation_data: Vec, + pub ring_assignment_data: Vec, - /// Parameters to be used in the next partition assignation computation. + /// Parameters to be used in the next partition assignment computation. pub staging_parameters: Lww, /// Role changes which are staged for the next version of the layout pub staging_roles: LwwMap, @@ -61,7 +61,7 @@ pub struct ClusterLayout { } impl garage_util::migrate::InitialFormat for ClusterLayout {} -/// This struct is used to set the parameters to be used in the assignation computation +/// This struct is used to set the parameters to be used in the assignment computation /// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct LayoutParameters { @@ -106,7 +106,7 @@ impl NodeRole { } } -// Implementation of the ClusterLayout methods unrelated to the assignation algorithm. +// Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { // We set the default zone redundancy to be equal to the replication factor, @@ -124,7 +124,7 @@ impl ClusterLayout { partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), - ring_assignation_data: Vec::new(), + ring_assignment_data: Vec::new(), parameters, staging_parameters, staging_roles: empty_lwwmap, @@ -183,7 +183,7 @@ To know the correct value of the new layout version, invoke `garage layout show` self.staging_roles.clear(); self.staging_hash = self.calculate_staging_hash(); - let msg = self.calculate_partition_assignation()?; + let msg = self.calculate_partition_assignment()?; self.version += 1; @@ -276,7 +276,7 @@ To know the correct value of the new layout version, invoke `garage layout show` for (i, id) in self.node_id_vec.iter().enumerate() { if id == uuid { let mut count = 0; - for nod in self.ring_assignation_data.iter() { + for nod in self.ring_assignment_data.iter() { if i as u8 == *nod { count += 1 } @@ -301,7 +301,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Check a cluster layout for internal consistency - /// (assignation, roles, parameters, partition size) + /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> Result<(), String> { // Check that the hash of the staging data is correct @@ -325,37 +325,37 @@ To know the correct value of the new layout version, invoke `garage layout show` return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } - // Check that the assignation data has the correct length - let expected_assignation_data_len = (1 << PARTITION_BITS) * self.replication_factor; - if self.ring_assignation_data.len() != expected_assignation_data_len { + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { return Err(format!( - "ring_assignation_data has incorrect length {} instead of {}", - self.ring_assignation_data.len(), - expected_assignation_data_len + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len )); } // Check that the assigned nodes are correct identifiers // of nodes that are assigned a role // and that role is not the role of a gateway nodes - for x in self.ring_assignation_data.iter() { + for x in self.ring_assignment_data.iter() { if *x as usize >= self.node_id_vec.len() { return Err(format!( - "ring_assignation_data contains invalid node id {}", + "ring_assignment_data contains invalid node id {}", *x )); } let node = self.node_id_vec[*x as usize]; match self.roles.get(&node) { Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), - _ => return Err("ring_assignation_data contains id of a gateway node".into()), + _ => return Err("ring_assignment_data contains id of a gateway node".into()), } } // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); if nodes_of_p.iter().unique().count() != rf { return Err(format!("partition does not contain {} unique node ids", rf)); } @@ -378,7 +378,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that the nodes capacities is consistent with the stored partitions let mut node_usage = vec![0; MAX_NODE_NUMBER]; - for n in self.ring_assignation_data.iter() { + for n in self.ring_assignment_data.iter() { node_usage[*n as usize] += 1; } for (n, usage) in node_usage.iter().enumerate() { @@ -415,21 +415,21 @@ To know the correct value of the new layout version, invoke `garage layout show` } } -// Implementation of the ClusterLayout methods related to the assignation algorithm. +// Implementation of the ClusterLayout methods related to the assignment algorithm. impl ClusterLayout { - /// This function calculates a new partition-to-node assignation. - /// The computed assignation respects the node replication factor + /// This function calculates a new partition-to-node assignment. + /// The computed assignment respects the node replication factor /// and the zone redundancy parameter It maximizes the capacity of a /// partition (assuming all partitions have the same size). - /// Among such optimal assignation, it minimizes the distance to - /// the former assignation (if any) to minimize the amount of + /// Among such optimal assignment, it minimizes the distance to + /// the former assignment (if any) to minimize the amount of /// data to be moved. /// Staged role changes must be merged with nodes roles before calling this function, /// hence it must only be called from apply_staged_changes() and hence is not public. - fn calculate_partition_assignation(&mut self) -> Result { + fn calculate_partition_assignment(&mut self) -> Result { // We update the node ids, since the node role list might have changed with the - // changes in the layout. We retrieve the old_assignation reframed with new ids - let old_assignation_opt = self.update_node_id_vec()?; + // changes in the layout. We retrieve the old_assignment reframed with new ids + let old_assignment_opt = self.update_node_id_vec()?; let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); @@ -467,7 +467,7 @@ impl ClusterLayout { // optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; - if old_assignation_opt != None { + if old_assignment_opt != None { msg.push(format!( "Optimal size of a partition: {} (was {} in the previous layout).", ByteSize::b(partition_size).to_string_as(false), @@ -490,16 +490,16 @@ impl ClusterLayout { ); } - // We compute a first flow/assignation that is heuristically close to the previous - // assignation - let mut gflow = self.compute_candidate_assignation(&zone_to_id, &old_assignation_opt)?; - if let Some(assoc) = &old_assignation_opt { - // We minimize the distance to the previous assignation. + // We compute a first flow/assignment that is heuristically close to the previous + // assignment + let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?; + if let Some(assoc) = &old_assignment_opt { + // We minimize the distance to the previous assignment. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } // We display statistics of the computation - msg.extend(self.output_stat(&gflow, &old_assignation_opt, &zone_to_id, &id_to_zone)?); + msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); msg.push("".to_string()); // We update the layout structure @@ -515,10 +515,10 @@ impl ClusterLayout { } /// The LwwMap of node roles might have changed. This function updates the node_id_vec - /// and returns the assignation given by ring, with the new indices of the nodes, and + /// and returns the assignment given by ring, with the new indices of the nodes, and /// None if the node is not present anymore. - /// We work with the assumption that only this function and calculate_new_assignation - /// do modify assignation_ring and node_id_vec. + /// We work with the assumption that only this function and calculate_new_assignment + /// do modify assignment_ring and node_id_vec. fn update_node_id_vec(&mut self) -> Result>>, Error> { // (1) We compute the new node list // Non gateway nodes should be coded on 8bits, hence they must be first in the list @@ -556,15 +556,15 @@ impl ClusterLayout { // (2) We retrieve the old association // We rewrite the old association with the new indices. We only consider partition - // to node assignations where the node is still in use. - if self.ring_assignation_data.is_empty() { + // to node assignments where the node is still in use. + if self.ring_assignment_data.is_empty() { // This is a new association return Ok(None); } - if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( - "The old assignation does not have a size corresponding to \ + "The old assignment does not have a size corresponding to \ the old replication factor or the number of partitions." .into(), )); @@ -579,11 +579,11 @@ impl ClusterLayout { uuid_to_new_id.insert(*uuid, i); } - let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; + let mut old_assignment = vec![Vec::::new(); NB_PARTITIONS]; let rf = self.replication_factor; - for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { - for old_id in &self.ring_assignation_data[p * rf..(p + 1) * rf] { + for (p, old_assign_p) in old_assignment.iter_mut().enumerate() { + for old_id in &self.ring_assignment_data[p * rf..(p + 1) * rf] { let uuid = old_node_id_vec[*old_id as usize]; if uuid_to_new_id.contains_key(&uuid) { old_assign_p.push(uuid_to_new_id[&uuid]); @@ -592,9 +592,9 @@ impl ClusterLayout { } // We write the ring - self.ring_assignation_data = Vec::::new(); + self.ring_assignment_data = Vec::::new(); - Ok(Some(old_assignation)) + Ok(Some(old_assignment)) } /// This function generates ids for the zone of the nodes appearing in @@ -661,11 +661,11 @@ impl ClusterLayout { } /// Generates the graph to compute the maximal flow corresponding to the optimal - /// partition assignation. + /// partition assignment. /// exclude_assoc is the set of (partition, node) association that we are forbidden /// to use (hence we do not add the corresponding edge to the graph). This parameter /// is used to compute a first flow that uses only edges appearing in the previous - /// assignation. This produces a solution that heuristically should be close to the + /// assignment. This produces a solution that heuristically should be close to the /// previous one. fn generate_flow_graph( &self, @@ -707,14 +707,14 @@ impl ClusterLayout { Ok(g) } - /// This function computes a first optimal assignation (in the form of a flow graph). - fn compute_candidate_assignation( + /// This function computes a first optimal assignment (in the form of a flow graph). + fn compute_candidate_assignment( &self, zone_to_id: &HashMap, prev_assign_opt: &Option>>, ) -> Result, Error> { // We list the (partition,node) associations that are not used in the - // previous assignation + // previous assignment let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { let nb_nodes = self.nongateway_nodes().len(); @@ -728,7 +728,7 @@ impl ClusterLayout { } } - // We compute the best flow using only the edges used in the previous assignation + // We compute the best flow using only the edges used in the previous assignment let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; g.compute_maximal_flow()?; @@ -744,7 +744,7 @@ impl ClusterLayout { } /// This function updates the flow graph gflow to minimize the distance between - /// its corresponding assignation and the previous one + /// its corresponding assignment and the previous one fn minimize_rebalance_load( &self, gflow: &mut Graph, @@ -752,7 +752,7 @@ impl ClusterLayout { prev_assign: &[Vec], ) -> Result<(), Error> { // We define a cost function on the edges (pairs of vertices) corresponding - // to the distance between the two assignations. + // to the distance between the two assignments. let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { @@ -771,25 +771,25 @@ impl ClusterLayout { Ok(()) } - /// This function updates the assignation ring from the flow graph. + /// This function updates the assignment ring from the flow graph. fn update_ring_from_flow( &mut self, nb_zones: usize, gflow: &Graph, ) -> Result<(), Error> { - self.ring_assignation_data = Vec::::new(); + self.ring_assignment_data = Vec::::new(); for p in 0..NB_PARTITIONS { for z in 0..nb_zones { let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; for vertex in assoc_vertex.iter() { if let Vertex::N(n) = vertex { - self.ring_assignation_data.push((*n).try_into().unwrap()); + self.ring_assignment_data.push((*n).try_into().unwrap()); } } } } - if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( "Critical Error : the association ring we produced does not \ have the right size." @@ -800,7 +800,7 @@ impl ClusterLayout { } /// This function returns a message summing up the partition repartition of the new - /// layout, and other statistics of the partition assignation computation. + /// layout, and other statistics of the partition assignment computation. fn output_stat( &self, gflow: &Graph, @@ -960,7 +960,7 @@ mod tests { // This function checks that the partition size S computed is at least better than the // one given by a very naive algorithm. To do so, we try to run the naive algorithm - // assuming a partion size of S+1. If we succed, it means that the optimal assignation + // assuming a partion size of S+1. If we succed, it means that the optimal assignment // was not optimal. The naive algorithm is the following : // - we compute the max number of partitions associated to every node, capped at the // partition number. It gives the number of tokens of every node. @@ -1065,7 +1065,7 @@ mod tests { } #[test] - fn test_assignation() { + fn test_assignment() { let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] -- cgit v1.2.3 From 84b4a868e3c5a6606bc61b9f944f8aa12a3233c1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 11 Jan 2023 17:47:46 +0100 Subject: Migration of cluster layout from v0.8 to v0.9 --- src/rpc/layout.rs | 238 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 178 insertions(+), 60 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index c471420c..b6c2fd27 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -5,8 +5,6 @@ use std::collections::HashSet; use bytesize::ByteSize; use itertools::Itertools; -use serde::{Deserialize, Serialize}; - use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::encode::nonversioned_encode; @@ -23,76 +21,196 @@ const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; // The Message type will be used to collect information on the algorithm. type Message = Vec; -/// The layout of the cluster, i.e. the list of roles -/// which are assigned to each cluster node -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. - pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap, - - /// node_id_vec: a vector of node IDs with a role assigned - /// in the system (this includes gateway nodes). - /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers holding - /// in u8 (the number of non-gateway nodes is at most 256). - /// 2. nodes that don't have a role are excluded (but they need to - /// stay in the CRDT as tombstones) - pub node_id_vec: Vec, - /// the assignment of data partitions to node, the values - /// are indices in node_id_vec - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec, - - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap, - pub staging_hash: Hash, +mod v08 { + use crate::ring::CompactNodeType; + use garage_util::crdt::LwwMap; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). + /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec, + /// the assignation of data partitions to node, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap, + pub staging_hash: Hash, + } + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRoleV(pub Option); + + /// The user-assigned roles of cluster nodes + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRole { + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution + pub zone: String, + /// The capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option, + /// A set of tags to recognize the node + pub tags: Vec, + } + + impl garage_util::migrate::InitialFormat for ClusterLayout {} } -impl garage_util::migrate::InitialFormat for ClusterLayout {} -/// This struct is used to set the parameters to be used in the assignment computation -/// algorithm. It is stored as a Crdt. -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct LayoutParameters { - pub zone_redundancy: usize, +mod v09 { + use super::v08; + use crate::ring::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + pub use v08::{NodeRole, NodeRoleV}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap, + + /// see comment in v08::ClusterLayout + pub node_id_vec: Vec, + /// see comment in v08::ClusterLayout + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap, + pub staging_hash: Hash, + } + + /// This struct is used to set the parameters to be used in the assignment computation + /// algorithm. It is stored as a Crdt. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct LayoutParameters { + pub zone_redundancy: usize, + } + + impl garage_util::migrate::Migrate for ClusterLayout { + const VERSION_MARKER: &'static [u8] = b"Glayout09"; + + type Previous = v08::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + use itertools::Itertools; + use std::collections::HashSet; + + // In the old layout, capacities are in an arbitrary unit, + // but in the new layout they are in bytes. + // Here we arbitrarily multiply everything by 1G, + // such that 1 old capacity unit = 1GB in the new units. + // This is totally arbitrary and won't work for most users. + let cap_mul = 1024 * 1024 * 1024; + let roles = multiply_all_capacities(previous.roles, cap_mul); + let staging_roles = multiply_all_capacities(previous.staging, cap_mul); + let node_id_vec = previous.node_id_vec; + + // Determine partition size + let mut tmp = previous.ring_assignation_data.clone(); + tmp.sort(); + let partition_size = tmp + .into_iter() + .dedup_with_count() + .map(|(npart, node)| { + roles + .get(&node_id_vec[node as usize]) + .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) + .unwrap_or(0) / npart as u64 + }) + .min() + .unwrap_or(0); + + // Determine zone redundancy parameter + let zone_redundancy = std::cmp::min( + previous.replication_factor, + roles + .items() + .iter() + .filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str())) + .collect::>() + .len(), + ); + let parameters = LayoutParameters { zone_redundancy }; + + let mut res = Self { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size, + parameters, + roles, + node_id_vec, + ring_assignment_data: previous.ring_assignation_data, + staging_parameters: Lww::new(parameters), + staging_roles, + staging_hash: [0u8; 32].into(), + }; + res.staging_hash = res.calculate_staging_hash(); + res + } + } + + fn multiply_all_capacities( + old_roles: LwwMap, + mul: u64, + ) -> LwwMap { + let mut new_roles = LwwMap::new(); + for (node, ts, role) in old_roles.items() { + let mut role = role.clone(); + if let NodeRoleV(Some(NodeRole { + capacity: Some(ref mut cap), + .. + })) = role + { + *cap = *cap * mul; + } + new_roles.merge_raw(node, *ts, &role); + } + new_roles + } } +pub use v09::*; + impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct NodeRoleV(pub Option); - impl AutoCrdt for NodeRoleV { const WARN_IF_DIFFERENT: bool = true; } -/// The user-assigned roles of cluster nodes -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct NodeRole { - /// Datacenter at which this entry belong. This information is used to - /// perform a better geodistribution - pub zone: String, - /// The capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option, - /// A set of tags to recognize the node - pub tags: Vec, -} - impl NodeRole { pub fn capacity_string(&self) -> String { match self.capacity { -- cgit v1.2.3 From 38d6ac429506f9f488ac522581b12fa530442a59 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 27 Apr 2023 17:57:54 +0200 Subject: New multipart upload table layout --- src/rpc/layout.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index b6c2fd27..c2655e59 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -119,7 +119,7 @@ mod v09 { } impl garage_util::migrate::Migrate for ClusterLayout { - const VERSION_MARKER: &'static [u8] = b"Glayout09"; + const VERSION_MARKER: &'static [u8] = b"G09layout"; type Previous = v08::ClusterLayout; -- cgit v1.2.3 From 2e229d44303bfafa22aaf0d4aa299021a937220e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 12 Sep 2023 17:24:51 +0200 Subject: new layout: improve output display --- src/rpc/layout.rs | 85 +++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index c2655e59..76d29b08 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -585,16 +585,16 @@ impl ClusterLayout { // optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + msg.push("".into()); if old_assignment_opt != None { msg.push(format!( - "Optimal size of a partition: {} (was {} in the previous layout).", + "Optimal partition size: {} ({} in previous layout)", ByteSize::b(partition_size).to_string_as(false), ByteSize::b(self.partition_size).to_string_as(false) )); } else { msg.push(format!( - "Given the replication and redundancy constraints, the \ - optimal size of a partition is {}.", + "Optimal partition size: {}", ByteSize::b(partition_size).to_string_as(false) )); } @@ -618,7 +618,6 @@ impl ClusterLayout { // We display statistics of the computation msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); - msg.push("".to_string()); // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; @@ -931,29 +930,33 @@ impl ClusterLayout { let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); - msg.push("".into()); msg.push(format!( - "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", + "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", ByteSize::b(used_cap).to_string_as(false), ByteSize::b(total_cap).to_string_as(false), percent_cap )); - msg.push("".into()); - msg.push( - "If the percentage is too low, it might be that the \ - replication/redundancy constraints force the use of nodes/zones with small \ - storage capacities. \ - You might want to rebalance the storage capacities or relax the constraints. \ - See the detailed statistics below and look for saturated nodes/zones." - .into(), - ); msg.push(format!( - "Recall that because of the replication factor, the actual available \ - storage capacity is {} / {} = {}.", - ByteSize::b(used_cap).to_string_as(false), + "Effective capacity (replication factor {}): {}", self.replication_factor, ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) )); + if percent_cap < 80. { + msg.push("".into()); + msg.push( + "If the percentage is too low, it might be that the \ + replication/redundancy constraints force the use of nodes/zones with small \ + storage capacities." + .into(), + ); + msg.push( + "You might want to rebalance the storage capacities or relax the constraints." + .into(), + ); + msg.push( + "See the detailed statistics below and look for saturated nodes/zones.".into(), + ); + } // We define and fill in the following tables let storing_nodes = self.nongateway_nodes(); @@ -1007,10 +1010,10 @@ impl ClusterLayout { transferred.", total_new_partitions )); + msg.push("".into()); } - msg.push("".into()); - msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); + let mut table = vec![]; for z in 0..id_to_zone.len() { let mut nodes_of_z = Vec::::new(); for n in 0..storing_nodes.len() { @@ -1020,15 +1023,9 @@ impl ClusterLayout { } let replicated_partitions: usize = nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); - msg.push("".into()); - - msg.push(format!( - "Zone {}: {} distinct partitions stored ({} new, \ - {} partition copies) ", - id_to_zone[z], - stored_partitions_zone[z], - new_partitions_zone[z], - replicated_partitions + table.push(format!( + "{}\tTags\tPartitions\tCapacity\tUsable capacity", + id_to_zone[z] )); let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; @@ -1037,33 +1034,35 @@ impl ClusterLayout { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); - msg.push(format!( - " Usable capacity / Total capacity: {} / {} ({:.1}%).", - ByteSize::b(available_cap_z).to_string_as(false), - ByteSize::b(total_cap_z).to_string_as(false), - percent_cap_z - )); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self - .node_role(&self.node_id_vec[*n]) - .ok_or("Node not found."))? - .tags_string(); - msg.push(format!( - " Node {:?}: {} partitions ({} new) ; \ - usable/total capacity: {} / {} ({:.1}%) ; tags:{}", + let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); + table.push(format!( + " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", self.node_id_vec[*n], + tags_n, stored_partitions[*n], new_partitions[*n], ByteSize::b(available_cap_n).to_string_as(false), ByteSize::b(total_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, - tags_n )); } + + table.push(format!( + " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", + replicated_partitions, + stored_partitions_zone[z], + //new_partitions_zone[z], + ByteSize::b(available_cap_z).to_string_as(false), + ByteSize::b(total_cap_z).to_string_as(false), + percent_cap_z + )); + table.push("".into()); } + msg.push(format_table::format_table_to_string(table)); Ok(msg) } -- cgit v1.2.3 From 015ccb39aa511c72d0c899713a828491871da3e7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 11:57:36 +0200 Subject: new layout: make zone_redundancy optionnal (if not set, is maximum) --- src/rpc/layout.rs | 131 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 35 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 76d29b08..9aa9c584 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,6 +1,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; +use std::fmt; use bytesize::ByteSize; use itertools::Itertools; @@ -115,7 +116,16 @@ mod v09 { /// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] pub struct LayoutParameters { - pub zone_redundancy: usize, + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// If None, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, } impl garage_util::migrate::Migrate for ClusterLayout { @@ -125,7 +135,6 @@ mod v09 { fn migrate(previous: Self::Previous) -> Self { use itertools::Itertools; - use std::collections::HashSet; // In the old layout, capacities are in an arbitrary unit, // but in the new layout they are in bytes. @@ -152,17 +161,10 @@ mod v09 { .min() .unwrap_or(0); - // Determine zone redundancy parameter - let zone_redundancy = std::cmp::min( - previous.replication_factor, - roles - .items() - .iter() - .filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str())) - .collect::>() - .len(), - ); - let parameters = LayoutParameters { zone_redundancy }; + // By default, zone_redundancy is None (i.e. maximum possible value) + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; let mut res = Self { version: previous.version, @@ -224,13 +226,37 @@ impl NodeRole { } } +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + // Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be equal to the replication factor, - // i.e. as strict as possible. + // We set the default zone redundancy to be None, meaning that the maximum + // possible value will be used depending on the cluster topology let parameters = LayoutParameters { - zone_redundancy: replication_factor, + zone_redundancy: ZoneRedundancy::Maximum, }; let staging_parameters = Lww::::new(parameters.clone()); @@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show` Ok(total_capacity) } + /// Returns the effective value of the zone_redundancy parameter + fn effective_zone_redundancy(&self) -> usize { + match self.parameters.zone_redundancy { + ZoneRedundancy::AtLeast(v) => v, + ZoneRedundancy::Maximum => { + let n_zones = self + .roles + .items() + .iter() + .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) + .collect::>() + .len(); + std::cmp::min(n_zones, self.replication_factor) + } + } + } + /// Check a cluster layout for internal consistency /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error @@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } // Check that every partition is associated to distinct nodes + let zone_redundancy = self.effective_zone_redundancy(); let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); @@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show` .expect("Zone not found.") }) .collect::>(); - let redundancy = self.parameters.zone_redundancy; - if zones_of_p.iter().unique().count() < redundancy { + if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( "nodes of partition are in less than {} distinct zones", - redundancy + zone_redundancy )); } } @@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); - match cl2.compute_optimal_partition_size(&zone_to_id) { + match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { Ok(s) if s != self.partition_size => { return Err(format!( "partition_size ({}) is different than optimal value ({})", @@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } } +// ==================================================================================== + // Implementation of the ClusterLayout methods related to the assignment algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignment. @@ -549,13 +594,15 @@ impl ClusterLayout { // changes in the layout. We retrieve the old_assignment reframed with new ids let old_assignment_opt = self.update_node_id_vec()?; + let zone_redundancy = self.effective_zone_redundancy(); + let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("".into()); msg.push(format!( "Partitions are \ replicated {} times on at least {} distinct zones.", - self.replication_factor, self.parameters.zone_redundancy + self.replication_factor, zone_redundancy )); // We generate for once numerical ids for the zones of non gateway nodes, @@ -570,12 +617,12 @@ impl ClusterLayout { nb_nongateway_nodes, self.replication_factor ))); } - if id_to_zone.len() < self.parameters.zone_redundancy { + if id_to_zone.len() < zone_redundancy { return Err(Error::Message(format!( "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", id_to_zone.len(), - self.parameters.zone_redundancy + zone_redundancy ))); } @@ -583,7 +630,7 @@ impl ClusterLayout { // Capacities should be given in a unit so that partition size is at least 100. // In this case, integer rounding plays a marginal role in the percentages of // optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; msg.push("".into()); if old_assignment_opt != None { @@ -610,7 +657,8 @@ impl ClusterLayout { // We compute a first flow/assignment that is heuristically close to the previous // assignment - let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?; + let mut gflow = + self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; if let Some(assoc) = &old_assignment_opt { // We minimize the distance to the previous assignment. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; @@ -735,9 +783,10 @@ impl ClusterLayout { fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, + zone_redundancy: usize, ) -> Result { let empty_set = HashSet::<(usize, usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; g.compute_maximal_flow()?; if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { return Err(Error::Message( @@ -750,7 +799,12 @@ impl ClusterLayout { let mut s_down = 1; let mut s_up = self.get_total_capacity()?; while s_down + 1 < s_up { - g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g = self.generate_flow_graph( + (s_down + s_up) / 2, + zone_to_id, + &empty_set, + zone_redundancy, + )?; g.compute_maximal_flow()?; if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { s_up = (s_down + s_up) / 2; @@ -789,18 +843,18 @@ impl ClusterLayout { partition_size: u64, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, + zone_redundancy: usize, ) -> Result, Error> { let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; g.add_edge( Vertex::Source, Vertex::Pdown(p), - (self.replication_factor - redundancy) as u64, + (self.replication_factor - zone_redundancy) as u64, )?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; @@ -829,6 +883,7 @@ impl ClusterLayout { &self, zone_to_id: &HashMap, prev_assign_opt: &Option>>, + zone_redundancy: usize, ) -> Result, Error> { // We list the (partition,node) associations that are not used in the // previous assignment @@ -846,7 +901,12 @@ impl ClusterLayout { } // We compute the best flow using only the edges used in the previous assignment - let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + let mut g = self.generate_flow_graph( + self.partition_size, + zone_to_id, + &exclude_edge, + zone_redundancy, + )?; g.compute_maximal_flow()?; // We add the excluded edges and compute the maximal flow with the full graph. @@ -997,7 +1057,7 @@ impl ClusterLayout { if *prev_assign_opt == None { new_partitions = stored_partitions.clone(); - new_partitions_zone = stored_partitions_zone.clone(); + //new_partitions_zone = stored_partitions_zone.clone(); } // We display the statistics @@ -1124,7 +1184,7 @@ mod tests { let mut curr_zone = 0; - let redundancy = cl.parameters.zone_redundancy; + let redundancy = cl.effective_zone_redundancy(); for replic in 0..cl.replication_factor { for p in 0..NB_PARTITIONS { @@ -1176,8 +1236,9 @@ mod tests { ); cl.staging_roles.merge(&update); } - cl.staging_parameters - .update(LayoutParameters { zone_redundancy }); + cl.staging_parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); cl.staging_hash = cl.calculate_staging_hash(); } -- cgit v1.2.3 From 749b4865d0a26c600fef79ab0456c827faafb9e8 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 12:07:45 +0200 Subject: new layout: improve display and fix comments --- src/rpc/layout.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 9aa9c584..c106114b 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -121,7 +121,7 @@ mod v09 { /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies /// of each partition on at least that number of different zones. - /// If None, copies will be stored on the maximum possible number of zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] pub enum ZoneRedundancy { AtLeast(usize), @@ -161,7 +161,7 @@ mod v09 { .min() .unwrap_or(0); - // By default, zone_redundancy is None (i.e. maximum possible value) + // By default, zone_redundancy is maximum possible value let parameters = LayoutParameters { zone_redundancy: ZoneRedundancy::Maximum, }; @@ -253,7 +253,7 @@ impl core::str::FromStr for ZoneRedundancy { // Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be None, meaning that the maximum + // We set the default zone redundancy to be Maximum, meaning that the maximum // possible value will be used depending on the cluster topology let parameters = LayoutParameters { zone_redundancy: ZoneRedundancy::Maximum, @@ -1005,12 +1005,12 @@ impl ClusterLayout { msg.push("".into()); msg.push( "If the percentage is too low, it might be that the \ - replication/redundancy constraints force the use of nodes/zones with small \ + cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ storage capacities." .into(), ); msg.push( - "You might want to rebalance the storage capacities or relax the constraints." + "You might want to move storage capacity between zones or relax the redundancy constraint." .into(), ); msg.push( @@ -1105,8 +1105,8 @@ impl ClusterLayout { tags_n, stored_partitions[*n], new_partitions[*n], - ByteSize::b(available_cap_n).to_string_as(false), ByteSize::b(total_cap_n).to_string_as(false), + ByteSize::b(available_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, )); } @@ -1116,8 +1116,8 @@ impl ClusterLayout { replicated_partitions, stored_partitions_zone[z], //new_partitions_zone[z], - ByteSize::b(available_cap_z).to_string_as(false), ByteSize::b(total_cap_z).to_string_as(false), + ByteSize::b(available_cap_z).to_string_as(false), percent_cap_z )); table.push("".into()); -- cgit v1.2.3 From 0088599f52f38ae9e00fe772a416150813e2470b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 12:17:07 +0200 Subject: new layout: fix clippy lints --- src/rpc/layout.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'src/rpc/layout.rs') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index c106114b..e02a180b 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -195,7 +195,7 @@ mod v09 { .. })) = role { - *cap = *cap * mul; + *cap *= mul; } new_roles.merge_raw(node, *ts, &role); } @@ -258,7 +258,7 @@ impl ClusterLayout { let parameters = LayoutParameters { zone_redundancy: ZoneRedundancy::Maximum, }; - let staging_parameters = Lww::::new(parameters.clone()); + let staging_parameters = Lww::::new(parameters); let empty_lwwmap = LwwMap::new(); @@ -322,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging_roles); self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = self.staging_parameters.get().clone(); + self.parameters = *self.staging_parameters.get(); self.staging_roles.clear(); self.staging_hash = self.calculate_staging_hash(); @@ -351,7 +351,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } self.staging_roles.clear(); - self.staging_parameters.update(self.parameters.clone()); + self.staging_parameters.update(self.parameters); self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -382,7 +382,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { - Some(role) if role.capacity != None => result.push(*uuid), + Some(role) if role.capacity.is_some() => result.push(*uuid), _ => (), } } @@ -633,7 +633,7 @@ impl ClusterLayout { let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; msg.push("".into()); - if old_assignment_opt != None { + if old_assignment_opt.is_some() { msg.push(format!( "Optimal partition size: {} ({} in previous layout)", ByteSize::b(partition_size).to_string_as(false), @@ -692,7 +692,7 @@ impl ClusterLayout { .roles .items() .iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) .map(|(k, _, _)| *k) .collect(); @@ -708,7 +708,7 @@ impl ClusterLayout { .roles .items() .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) .map(|(k, _, _)| *k) .collect(); @@ -770,7 +770,7 @@ impl ClusterLayout { for uuid in self.nongateway_nodes().iter() { let r = self.node_role(uuid).unwrap(); - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); id_to_zone.push(r.zone.clone()); } @@ -1055,7 +1055,7 @@ impl ClusterLayout { } } - if *prev_assign_opt == None { + if prev_assign_opt.is_none() { new_partitions = stored_partitions.clone(); //new_partitions_zone = stored_partitions_zone.clone(); } @@ -1063,7 +1063,7 @@ impl ClusterLayout { // We display the statistics msg.push("".into()); - if *prev_assign_opt != None { + if prev_assign_opt.is_some() { let total_new_partitions: usize = new_partitions.iter().sum(); msg.push(format!( "A total of {} new copies of partitions need to be \ -- cgit v1.2.3