From c1d1646c4d62300ec48503aa65623ee7e3df8685 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 1 May 2022 09:54:19 +0200 Subject: Change the way new layout assignations are computed. The function now computes an optimal assignation (with respect to partition size) that minimizes the distance to the former assignation, using flow algorithms. This commit was written by Mendes Oulamara --- src/rpc/Cargo.toml | 1 + src/rpc/layout.rs | 881 +++++++++++++++++++++++++------------------------- src/util/bipartite.rs | 378 ++++++++++++++++++++++ src/util/lib.rs | 1 + 4 files changed, 827 insertions(+), 434 deletions(-) create mode 100644 src/util/bipartite.rs (limited to 'src') diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index efaacf2e..654c1dc6 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -23,6 +23,7 @@ gethostname = "0.2" hex = "0.4" tracing = "0.1.30" rand = "0.8" +itertools="0.10" sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } async-trait = "0.1.7" diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index b9c02c21..afd7df17 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,10 +1,14 @@ use std::cmp::Ordering; -use std::collections::{HashMap, HashSet}; +use std::cmp::{min}; +use std::collections::{HashMap}; use serde::{Deserialize, Serialize}; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; +use garage_util::bipartite::*; + +use rand::prelude::SliceRandom; use crate::ring::*; @@ -164,445 +168,454 @@ impl ClusterLayout { true } - /// Calculate an assignation of partitions to nodes - pub fn calculate_partition_assignation(&mut self) -> bool { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); - - println!("Calculating updated partition assignation, this may take some time..."); - println!(); - - // Get old partition assignation - let old_partitions = self.parse_assignation_data(); - - // Start new partition assignation with nodes from old assignation where it is relevant - let mut partitions = old_partitions - .iter() - .map(|old_part| { - let mut new_part = PartitionAss::new(); - for node in old_part.nodes.iter() { - if let Some(role) = node.1 { - if role.capacity.is_some() { - new_part.add(None, n_zones, node.0, role); - } - } - } - new_part - }) - .collect::>(); - - // In various cases, not enough nodes will have been added for all partitions - // in the step above (e.g. due to node removals, or new zones being added). - // Here we add more nodes to make a complete (but sub-optimal) assignation, - // using an initial partition assignation that is calculated using the multi-dc maglev trick - match self.initial_partition_assignation() { - Some(initial_partitions) => { - for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) { - for (id, info) in ipart.nodes.iter() { - if part.nodes.len() < self.replication_factor { - part.add(None, n_zones, id, info.unwrap()); - } - } - assert!(part.nodes.len() == self.replication_factor); - } - } - None => { - // Not enough nodes in cluster to build a correct assignation. - // Signal it by returning an error. - return false; - } - } - - // Calculate how many partitions each node should ideally store, - // and how many partitions they are storing with the current assignation - // This defines our target for which we will optimize in the following loop. 
- let total_capacity = configured_nodes - .iter() - .map(|(_, info)| info.capacity.unwrap_or(0)) - .sum::() as usize; - let total_partitions = self.replication_factor * (1 << PARTITION_BITS); - let target_partitions_per_node = configured_nodes - .iter() - .map(|(id, info)| { - ( - *id, - info.capacity.unwrap_or(0) as usize * total_partitions / total_capacity, - ) - }) - .collect::>(); - - let mut partitions_per_node = self.partitions_per_node(&partitions[..]); - - println!("Target number of partitions per node:"); - for (node, npart) in target_partitions_per_node.iter() { - println!("{:?}\t{}", node, npart); - } - println!(); - - // Shuffle partitions between nodes so that nodes will reach (or better approach) - // their target number of stored partitions - loop { - let mut option = None; - for (i, part) in partitions.iter_mut().enumerate() { - for (irm, (idrm, _)) in part.nodes.iter().enumerate() { - let errratio = |node, parts| { - let tgt = *target_partitions_per_node.get(node).unwrap() as f32; - (parts - tgt) / tgt - }; - let square = |x| x * x; - - let partsrm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as f32; - - for (idadd, infoadd) in configured_nodes.iter() { - // skip replacing a node by itself - // and skip replacing by gateway nodes - if idadd == idrm || infoadd.capacity.is_none() { - continue; - } - - // We want to try replacing node idrm by node idadd - // if that brings us close to our goal. - let partsadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as f32; - let oldcost = square(errratio(*idrm, partsrm) - errratio(*idadd, partsadd)); - let newcost = - square(errratio(*idrm, partsrm - 1.) - errratio(*idadd, partsadd + 1.)); - if newcost >= oldcost { - // not closer to our goal - continue; - } - let gain = oldcost - newcost; - - let mut newpart = part.clone(); - - newpart.nodes.remove(irm); - if !newpart.add(None, n_zones, idadd, infoadd) { - continue; - } - assert!(newpart.nodes.len() == self.replication_factor); - - if !old_partitions[i] - .is_valid_transition_to(&newpart, self.replication_factor) - { - continue; - } - - if option - .as_ref() - .map(|(old_gain, _, _, _, _)| gain > *old_gain) - .unwrap_or(true) - { - option = Some((gain, i, idadd, idrm, newpart)); - } - } - } - } - if let Some((_gain, i, idadd, idrm, newpart)) = option { - *partitions_per_node.entry(idadd).or_insert(0) += 1; - *partitions_per_node.get_mut(idrm).unwrap() -= 1; - partitions[i] = newpart; - } else { - break; - } - } - // Check we completed the assignation correctly - // (this is a set of checks for the algorithm's consistency) - assert!(partitions.len() == (1 << PARTITION_BITS)); - assert!(partitions - .iter() - .all(|p| p.nodes.len() == self.replication_factor)); - - let new_partitions_per_node = self.partitions_per_node(&partitions[..]); - assert!(new_partitions_per_node == partitions_per_node); - - // Show statistics - println!("New number of partitions per node:"); - for (node, npart) in partitions_per_node.iter() { - let tgt = *target_partitions_per_node.get(node).unwrap(); - let pct = 100f32 * (*npart as f32) / (tgt as f32); - println!("{:?}\t{}\t({}% of {})", node, npart, pct as i32, tgt); - } - println!(); - - let mut diffcount = HashMap::new(); - for (oldpart, newpart) in old_partitions.iter().zip(partitions.iter()) { - let nminus = oldpart.txtplus(newpart); - let nplus = newpart.txtplus(oldpart); - if nminus != "[...]" || nplus != "[...]" { - let tup = (nminus, nplus); - *diffcount.entry(tup).or_insert(0) += 1; - } - } - if diffcount.is_empty() { - println!("No 
data will be moved between nodes."); - } else { - let mut diffcount = diffcount.into_iter().collect::>(); - diffcount.sort(); - println!("Number of partitions that move:"); - for ((nminus, nplus), npart) in diffcount { - println!("\t{}\t{} -> {}", npart, nminus, nplus); - } - } - println!(); - - // Calculate and save new assignation data - let (nodes, assignation_data) = - self.compute_assignation_data(&configured_nodes[..], &partitions[..]); - - self.node_id_vec = nodes; - self.ring_assignation_data = assignation_data; - - true - } - - fn initial_partition_assignation(&self) -> Option>> { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); - - // Create a vector of partition indices (0 to 2**PARTITION_BITS-1) - let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::>(); - - // Prepare ring - let mut partitions: Vec = partitions_idx - .iter() - .map(|_i| PartitionAss::new()) - .collect::>(); - - // Create MagLev priority queues for each node - let mut queues = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(node_id, node_info)| { - let mut parts = partitions_idx - .iter() - .map(|i| { - let part_data = - [&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat(); - (*i, fasthash(&part_data[..])) - }) - .collect::>(); - parts.sort_by_key(|(_i, h)| *h); - let parts_i = parts.iter().map(|(i, _h)| *i).collect::>(); - (node_id, node_info, parts_i, 0) - }) - .collect::>(); - - let max_capacity = configured_nodes - .iter() - .filter_map(|(_, node_info)| node_info.capacity) - .fold(0, std::cmp::max); - - // Fill up ring - for rep in 0..self.replication_factor { - queues.sort_by_key(|(ni, _np, _q, _p)| { - let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat(); - fasthash(&queue_data[..]) - }); - - for (_, _, _, pos) in queues.iter_mut() { - *pos = 0; - } - - let mut remaining = partitions_idx.len(); - while remaining > 0 { - let remaining0 = remaining; - for i_round in 0..max_capacity { - for (node_id, node_info, q, pos) in queues.iter_mut() { - if i_round >= node_info.capacity.unwrap() { - continue; - } - for (pos2, &qv) in q.iter().enumerate().skip(*pos) { - if partitions[qv].add(Some(rep + 1), n_zones, node_id, node_info) { - remaining -= 1; - *pos = pos2 + 1; - break; - } - } - } - } - if remaining == remaining0 { - // No progress made, exit - return None; - } - } - } - - Some(partitions) - } + /// This function calculates a new partition-to-node assignation. + /// The computed assignation maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignation, it minimizes the distance to + /// the former assignation (if any) to minimize the amount of + /// data to be moved. A heuristic ensures node triplets + /// dispersion (in garage_util::bipartite::optimize_matching()). + pub fn calculate_partition_assignation(&mut self) -> bool { + + //The nodes might have been updated, some might have been deleted. + //So we need to first update the list of nodes and retrieve the + //assignation. + let old_node_assignation = self.update_nodes_and_ring(); + + let (node_zone, _) = self.get_node_zone_capacity(); + + //We compute the optimal number of partition to assign to + //every node and zone. 
+        if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions(){
+            //We collect part_per_zone in a vec to not rely on the
+            //arbitrary order in which elements are iterated in
+            //Hashmap::iter()
+            let part_per_zone_vec = part_per_zone.iter()
+                .map(|(x,y)| (x.clone(),*y))
+                .collect::<Vec<(String,usize)>>();
+            //We create an indexing of the zones
+            let mut zone_id = HashMap::<String,usize>::new();
+            for i in 0..part_per_zone_vec.len(){
+                zone_id.insert(part_per_zone_vec[i].0.clone(), i);
+            }
+
+            //We compute a candidate for the new partition to zone
+            //assignation.
+            let nb_zones = part_per_zone.len();
+            let nb_nodes = part_per_nod.len();
+            let nb_partitions = 1<<PARTITION_BITS;
+            let left_cap_vec = vec![self.replication_factor as u32; nb_partitions];
+            let right_cap_vec = part_per_zone_vec.iter().map(|(_,y)| *y as u32).collect();
+            let mut zone_assignation = dinic_compute_matching(left_cap_vec, right_cap_vec);
+
+            //We create the structure for the partition-to-node assignation.
+            let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions];
+            //We will decrement part_per_nod to keep track of the number
+            //of partitions that we still have to associate.
+            let mut part_per_nod = part_per_nod.clone();
+
+            //We minimize the distance to the former assignation (if any)
+
+            //We get the id of the zones of the former assignation
+            //(and the id no_zone if there is no node assigned)
+            let no_zone = part_per_zone_vec.len();
+            let old_zone_assignation : Vec<Vec<usize>> =
+                old_node_assignation.iter().map(|x| x.iter().map(
+                    |id| match *id { Some(i) => zone_id[&node_zone[i]] ,
+                            None => no_zone }
+                    ).collect()).collect();
+
+            //We minimize the distance to the former zone assignation
+            zone_assignation = optimize_matching(
+                &old_zone_assignation, &zone_assignation, nb_zones+1); //+1 for no_zone
+
+            //We need to assign partitions to nodes in their zone
+            //We first put the nodes assignation that can stay the same
+            for i in 0..nb_partitions{
+                for j in 0..self.replication_factor {
+                    if let Some(Some(former_node)) = old_node_assignation[i].iter().find(
+                        |x| if let Some(id) = x {
+                            zone_id[&node_zone[*id]] == zone_assignation[i][j]
+                        }
+                        else {false}
+                    )
+                    {
+                        if part_per_nod[*former_node] > 0 {
+                            node_assignation[i][j] = Some(*former_node);
+                            part_per_nod[*former_node] -= 1;
+                        }
+                    }
+                }
+            }
+
+            //We complete the assignation of partitions to nodes
+            let mut rng = rand::thread_rng();
+            for i in 0..nb_partitions {
+                for j in 0..self.replication_factor {
+                    if node_assignation[i][j] == None {
+                        let possible_nodes : Vec<usize> = (0..nb_nodes)
+                            .filter(
+                                |id| zone_id[&node_zone[*id]] == zone_assignation[i][j]
+                                && part_per_nod[*id] > 0).collect();
+                        assert!(possible_nodes.len()>0);
+                        //We randomly pick a node
+                        if let Some(nod) = possible_nodes.choose(&mut rng){
+                            node_assignation[i][j] = Some(*nod);
+                            part_per_nod[*nod] -= 1;
+                        }
+                    }
+                }
+            }
+
+            //We write the assignation in the 1D table
+            self.ring_assignation_data = Vec::<CompactNodeType>::new();
+            for i in 0..nb_partitions{
+                for j in 0..self.replication_factor {
+                    if let Some(id) = node_assignation[i][j] {
+                        self.ring_assignation_data.push(id as CompactNodeType);
+                    }
+                    else {assert!(false)}
+                }
+            }
+
+            true
+        }
+        else { false }
+    }
+
+    /// The LwwMap of node roles might have changed. This function updates the node_id_vec
+    /// and returns the assignation given by ring, with the new indices of the nodes, and
+    /// None if the node is not present anymore.
+    /// We work with the assumption that only this function and calculate_partition_assignation
+    /// do modify assignation_ring and node_id_vec.
+    fn update_nodes_and_ring(&mut self) -> Vec<Vec<Option<usize>>> {
+        let nb_partitions = 1usize<<PARTITION_BITS;
+        let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions];
+        let rf = self.replication_factor;
+        let ring = &self.ring_assignation_data;
+
+        let new_node_id_vec : Vec<Uuid> = self.roles.items().iter()
+            .map(|(k, _, _)| *k)
+            .collect();
+
+        if ring.len() == rf*nb_partitions {
+            for i in 0..nb_partitions {
+                for j in 0..self.replication_factor {
+                    node_assignation[i][j] = new_node_id_vec.iter()
+                        .position(|id| *id == self.node_id_vec[ring[i*rf + j] as usize]);
+                }
+            }
+        }
+
+        self.node_id_vec = new_node_id_vec;
+        self.ring_assignation_data = vec![];
+        return node_assignation;
+    }
+
+    ///This function computes the number of partitions to assign to
+    ///every node and zone, so that every partition is replicated
+    ///self.replication_factor times and the capacity of a partition
+    ///is maximized.
+    fn optimal_proportions(&mut self) -> Option<(Vec<usize>, HashMap<String, usize>)> {
+
+        let mut zone_capacity : HashMap<String, u32> = HashMap::new();
+
+        let (node_zone, node_capacity) = self.get_node_zone_capacity();
+        let nb_nodes = self.node_id_vec.len();
+
+        for i in 0..nb_nodes
+        {
+            if zone_capacity.contains_key(&node_zone[i]) {
+                zone_capacity.insert(node_zone[i].clone(), zone_capacity[&node_zone[i]] + node_capacity[i]);
+            }
+            else{
+                zone_capacity.insert(node_zone[i].clone(), node_capacity[i]);
+            }
+        }
+
+        //Compute the optimal number of partitions per zone
+        let sum_capacities: u32 = zone_capacity.values().sum();
+
+        if sum_capacities <= 0 {
+            println!("No storage capacity in the network.");
+            return None;
+        }
+
+        let nb_partitions = 1<<PARTITION_BITS;
+
+        //Initially we would like to use zones proportionally to
+        //their capacity.
+        //However, a large zone can be associated to at most
+        //nb_partitions to ensure replication of the data.
+        //So we take the min with nb_partitions:
+        let mut part_per_zone : HashMap<String, usize> =
+            zone_capacity.iter()
+            .map(|(k, v)| (k.clone(), min(nb_partitions,
+                (self.replication_factor*nb_partitions
+                **v as usize)/sum_capacities as usize) ) ).collect();
+
+        //The replication_factor-1 upper bounds the number of
+        //part_per_zones that are greater than nb_partitions
+        for _ in 1..self.replication_factor {
+            //The number of partitions that are not assigned to
+            //a zone that takes nb_partitions.
+            let sum_capleft : u32 = zone_capacity.keys()
+                .filter(| k | {part_per_zone[*k] < nb_partitions} )
+                .map(|k| zone_capacity[k]).sum();
+
+            //The number of replications of the data that we need
+            //to ensure.
+            let repl_left = self.replication_factor
+                - part_per_zone.values()
+                .filter(|x| {**x == nb_partitions})
+                .count();
+            if repl_left == 0 {
+                break;
+            }
+
+            for k in zone_capacity.keys() {
+                if part_per_zone[k] != nb_partitions
+                {
+                    part_per_zone.insert(k.to_string() , min(nb_partitions,
+                        (nb_partitions*zone_capacity[k] as usize
+                        *repl_left)/sum_capleft as usize));
+                }
+            }
+        }
+
+        //Now we divide the zone's partition share proportionally
+        //between their nodes.
+
+        let mut part_per_nod : Vec<usize> = (0..nb_nodes).map(
+            |i| (part_per_zone[&node_zone[i]]*node_capacity[i] as usize)/zone_capacity[&node_zone[i]] as usize
+        )
+        .collect();
+
+        //We must update the part_per_zone to make it correspond to
+        //part_per_nod (because of integer rounding)
+        part_per_zone = part_per_zone.iter().map(|(k,_)|
+            (k.clone(), 0))
+            .collect();
+        for i in 0..nb_nodes {
+            part_per_zone.insert(
+                node_zone[i].clone() ,
+                part_per_zone[&node_zone[i]] + part_per_nod[i]);
+        }
+
+        //Because of integer rounding, the total sum of part_per_nod
+        //might not be replication_factor*nb_partitions.
+        // We need at most to add 1 to every non maximal value of
+        // part_per_nod. The capacity of a partition will be bounded
+        // by the minimal value of
+        // node_capacity_vec[i]/part_per_nod[i]
+        // so we try to maximize this minimal value, keeping the
+        // part_per_zone capped
+
+        let discrepancy : usize =
+            nb_partitions*self.replication_factor
+            - part_per_nod.iter().sum::<usize>();
+
+        //We use a stupid O(N^2) algorithm. If the number of nodes
+        //is actually expected to be high, one should optimize this.
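//Added note, sketching the invariant: each of the `discrepancy`
//iterations below hands one extra partition to the node maximizing
//node_capacity[i]/(part_per_nod[i]+1) among nodes whose zone is not
//already at nb_partitions; the comparisons are written with
//cross-multiplication to stay in integer arithmetic.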
+ + for _ in 0..discrepancy { + if let Some(idmax) = (0..nb_nodes) + .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) + .max_by( |i,j| + (node_capacity[*i]*(part_per_nod[*j]+1) as u32) + .cmp(&(node_capacity[*j]*(part_per_nod[*i]+1) as u32)) + ) + { + part_per_nod[idmax] += 1; + part_per_zone.insert(node_zone[idmax].clone(),part_per_zone[&node_zone[idmax]]+1); + } + } + + //We check the algorithm consistency + + let discrepancy : usize = + nb_partitions*self.replication_factor + - part_per_nod.iter().sum::(); + assert!(discrepancy == 0); + assert!(if let Some(v) = part_per_zone.values().max() + {*v <= nb_partitions} else {false} ); + + Some((part_per_nod, part_per_zone)) + } + + + //Returns vectors of zone and capacity; indexed by the same (temporary) + //indices as node_id_vec. + fn get_node_zone_capacity(& self) -> (Vec , Vec) { + + let node_zone = self.node_id_vec.iter().map( + |id_nod| match self.node_role(id_nod) { + Some(NodeRole{zone,capacity:_,tags:_}) => zone.clone() , + _ => "".to_string() + } + ).collect(); + + let node_capacity = self.node_id_vec.iter().map( + |id_nod| match self.node_role(id_nod) { + Some(NodeRole{zone:_,capacity,tags:_}) => + if let Some(c)=capacity + {*c} + else {0}, + _ => 0 + } + ).collect(); + + (node_zone,node_capacity) + } - fn configured_nodes_and_zones(&self) -> (Vec<(&Uuid, &NodeRole)>, HashSet<&str>) { - let configured_nodes = self - .roles - .items() - .iter() - .filter(|(_id, _, info)| info.0.is_some()) - .map(|(id, _, info)| (id, info.0.as_ref().unwrap())) - .collect::>(); - - let zones = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(_id, info)| info.zone.as_str()) - .collect::>(); - - (configured_nodes, zones) - } - - fn compute_assignation_data<'a>( - &self, - configured_nodes: &[(&'a Uuid, &'a NodeRole)], - partitions: &[PartitionAss<'a>], - ) -> (Vec, Vec) { - assert!(partitions.len() == (1 << PARTITION_BITS)); - - // Make a canonical order for nodes - let mut nodes = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(id, _)| **id) - .collect::>(); - let nodes_rev = nodes - .iter() - .enumerate() - .map(|(i, id)| (*id, i as CompactNodeType)) - .collect::>(); - - let mut assignation_data = vec![]; - for partition in partitions.iter() { - assert!(partition.nodes.len() == self.replication_factor); - for (id, _) in partition.nodes.iter() { - assignation_data.push(*nodes_rev.get(id).unwrap()); - } - } - - nodes.extend( - configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_none()) - .map(|(id, _)| **id), - ); - - (nodes, assignation_data) - } - - fn parse_assignation_data(&self) -> Vec> { - if self.ring_assignation_data.len() == self.replication_factor * (1 << PARTITION_BITS) { - // If the previous assignation data is correct, use that - let mut partitions = vec![]; - for i in 0..(1 << PARTITION_BITS) { - let mut part = PartitionAss::new(); - for node_i in self.ring_assignation_data - [i * self.replication_factor..(i + 1) * self.replication_factor] - .iter() - { - let node_id = &self.node_id_vec[*node_i as usize]; - - if let Some(NodeRoleV(Some(info))) = self.roles.get(node_id) { - part.nodes.push((node_id, Some(info))); - } else { - part.nodes.push((node_id, None)); - } - } - partitions.push(part); - } - partitions - } else { - // Otherwise start fresh - (0..(1 << PARTITION_BITS)) - .map(|_| PartitionAss::new()) - .collect() - } - } - - fn partitions_per_node<'a>(&self, partitions: &[PartitionAss<'a>]) -> HashMap<&'a Uuid, usize> { - let mut 
partitions_per_node = HashMap::<&Uuid, usize>::new(); - for p in partitions.iter() { - for (id, _) in p.nodes.iter() { - *partitions_per_node.entry(*id).or_insert(0) += 1; - } - } - partitions_per_node - } -} - -// ---- Internal structs for partition assignation in layout ---- - -#[derive(Clone)] -struct PartitionAss<'a> { - nodes: Vec<(&'a Uuid, Option<&'a NodeRole>)>, } -impl<'a> PartitionAss<'a> { - fn new() -> Self { - Self { nodes: Vec::new() } - } - fn nplus(&self, other: &PartitionAss<'a>) -> usize { - self.nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .count() - } - fn txtplus(&self, other: &PartitionAss<'a>) -> String { - let mut nodes = self - .nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .map(|x| format!("{:?}", x.0)) - .collect::>(); - nodes.sort(); - if self.nodes.iter().any(|x| other.nodes.contains(x)) { - nodes.push("...".into()); - } - format!("[{}]", nodes.join(" ")) - } +#[cfg(test)] +mod tests { + use super::*; + use itertools::Itertools; + + fn check_assignation(cl : &ClusterLayout) { + + //Check that input data has the right format + let nb_partitions = 1usize<>(); + + let zone_vec = node_zone.iter().unique().collect::>(); + let zone_nb_part = zone_vec.iter().map( |z| cl.ring_assignation_data.iter() + .filter(|x| node_zone[**x as usize] == **z) + .count() + ).collect::>(); + + //Check optimality of the zone assignation : would it be better for the + //node_capacity/node_partitions ratio to change the assignation of a partition + + if let Some(idmin) = (0..nb_nodes).min_by( + |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) + ){ + if let Some(idnew) = (0..nb_nodes) + .filter( |i| if let Some(p) = zone_vec.iter().position(|z| **z==node_zone[*i]) + {zone_nb_part[p] < nb_partitions } + else { false }) + .max_by( + |i,j| (node_capacity[*i]*(node_nb_part[*j]as u32+1)) + .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) + ){ + assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= + node_capacity[idnew]*node_nb_part[idmin] as u32); + } + + } + + //In every zone, check optimality of the nod assignation + for z in zone_vec { + let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z ); + if let Some(idmin) = node_of_z_iter.clone().min_by( + |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) + ){ + if let Some(idnew) = node_of_z_iter.min_by( + |i,j| (node_capacity[*i]*(node_nb_part[*j] as u32+1)) + .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) + ){ + assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= + node_capacity[idnew]*node_nb_part[idmin] as u32); + } + } + } + + } + + fn update_layout(cl : &mut ClusterLayout, node_id_vec : &Vec, + node_capacity_vec : &Vec , node_zone_vec : &Vec) { + for i in 0..node_id_vec.len(){ + if let Some(x) = FixedBytes32::try_from(&[i as u8;32]) { + cl.node_id_vec.push(x); + } + + let update = cl.roles.update_mutator(cl.node_id_vec[i] , + NodeRoleV(Some(NodeRole{ + zone : (node_zone_vec[i].to_string()), + capacity : (Some(node_capacity_vec[i])), + tags : (vec![])}))); + cl.roles.merge(&update); + } + } + + #[test] + fn test_assignation() { + + let mut node_id_vec = vec![1,2,3]; + let mut node_capacity_vec = vec![4000,1000,2000]; + let mut node_zone_vec= vec!["A", "B", "C"].into_iter().map(|x| x.to_string()).collect(); + + let mut cl = ClusterLayout { + node_id_vec: vec![], + + roles : LwwMap::new(), + + replication_factor: 3, + ring_assignation_data : vec![], + version:0, + 
staging: LwwMap::new(), + staging_hash: sha256sum(&[1;32]), + }; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_id_vec = vec![1,2,3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000,1000,1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec= vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"].into_iter().map(|x| x.to_string()).collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000,1000,2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + + node_capacity_vec = vec![4000,4000,2000, 7000, 1000, 9000, 2000, 10, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + } +} - fn is_valid_transition_to(&self, other: &PartitionAss<'a>, replication_factor: usize) -> bool { - let min_keep_nodes_per_part = (replication_factor + 1) / 2; - let n_removed = self.nplus(other); - if self.nodes.len() <= min_keep_nodes_per_part { - n_removed == 0 - } else { - n_removed <= self.nodes.len() - min_keep_nodes_per_part - } - } - // add is a key function in creating a PartitionAss, i.e. the list of nodes - // to which a partition is assigned. It tries to add a certain node id to the - // assignation, but checks that doing so is compatible with the NECESSARY - // condition that the partition assignation must be dispersed over different - // zones (datacenters) if enough zones exist. This is why it takes a n_zones - // parameter, which is the total number of zones that have existing nodes: - // if nodes in the assignation already cover all n_zones zones, then any node - // that is not yet in the assignation can be added. Otherwise, only nodes - // that are in a new zone can be added. - fn add( - &mut self, - target_len: Option, - n_zones: usize, - node: &'a Uuid, - role: &'a NodeRole, - ) -> bool { - if let Some(tl) = target_len { - if self.nodes.len() != tl - 1 { - return false; - } - } - - let p_zns = self - .nodes - .iter() - .map(|(_id, info)| info.unwrap().zone.as_str()) - .collect::>(); - if (p_zns.len() < n_zones && !p_zns.contains(&role.zone.as_str())) - || (p_zns.len() == n_zones && !self.nodes.iter().any(|(id, _)| *id == node)) - { - self.nodes.push((node, Some(role))); - true - } else { - false - } - } -} diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs new file mode 100644 index 00000000..aec7b042 --- /dev/null +++ b/src/util/bipartite.rs @@ -0,0 +1,378 @@ +/* + * This module deals with graph algorithm in complete bipartite + * graphs. It is used in layout.rs to build the partition to node + * assignation. + * */ + +use std::cmp::{min,max}; +use std::collections::VecDeque; +use rand::prelude::SliceRandom; + +//Graph data structure for the flow algorithm. +#[derive(Clone,Copy,Debug)] +struct EdgeFlow{ + c : i32, + flow : i32, + v : usize, + rev : usize, +} + +//Graph data structure for the detection of positive cycles. +#[derive(Clone,Copy,Debug)] +struct WeightedEdge{ + w : i32, + u : usize, + v : usize, +} + + +/* This function takes two matchings (old_match and new_match) in a + * complete bipartite graph. 
It returns a matching that has the
+ * same degree as new_match at every vertex, and that is as close
+ * as possible to old_match.
+ * */
+pub fn optimize_matching( old_match : &Vec<Vec<usize>> ,
+            new_match : &Vec<Vec<usize>> ,
+            nb_right : usize )
+    -> Vec<Vec<usize>> {
+    let nb_left = old_match.len();
+    let ed = WeightedEdge{w:-1,u:0,v:0};
+    let mut edge_vec = vec![ed ; nb_left*nb_right];
+
+    //We build the complete bipartite graph structure, represented
+    //by the list of all edges.
+    for i in 0..nb_left {
+        for j in 0..nb_right{
+            edge_vec[i*nb_right + j].u = i;
+            edge_vec[i*nb_right + j].v = nb_left+j;
+        }
+    }
+
+    for i in 0..edge_vec.len() {
+        //We add the old matchings
+        if old_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) {
+            edge_vec[i].w *= -1;
+        }
+        //We add the new matchings
+        if new_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) {
+            (edge_vec[i].u,edge_vec[i].v) =
+                (edge_vec[i].v,edge_vec[i].u);
+            edge_vec[i].w *= -1;
+        }
+    }
+    //Now edge_vec is a graph where edges are oriented LR if we
+    //can add them to new_match, and RL otherwise. If
+    //adding/removing them makes the matching closer to old_match
+    //they have weight 1; and -1 otherwise.
+
+    //We shuffle the edge list so that there is no bias depending on
+    //partition/zone labels in the triplet dispersion
+    let mut rng = rand::thread_rng();
+    edge_vec.shuffle(&mut rng);
+
+    //Discovering and flipping a cycle with positive weight in this
+    //graph will make the matching closer to old_match.
+    //We use the Bellman-Ford algorithm to discover positive cycles
+    loop{
+        if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) {
+            for i in cycle {
+                //We flip the edges of the cycle.
+                (edge_vec[i].u,edge_vec[i].v) =
+                    (edge_vec[i].v,edge_vec[i].u);
+                edge_vec[i].w *= -1;
+            }
+        }
+        else {
+            //If there is no cycle, we return the optimal matching.
+            break;
+        }
+    }
+
+    //The optimal matching is built from the graph structure.
+    let mut matching = vec![Vec::<usize>::new() ; nb_left];
+    for e in edge_vec {
+        if e.u > e.v {
+            matching[e.v].push(e.u-nb_left);
+        }
+    }
+    matching
+}
+
+//This function finds a positive cycle in a bipartite weighted graph.
+fn positive_cycle( edge_vec : &Vec<WeightedEdge>, nb_left : usize,
+            nb_right : usize) -> Option<Vec<usize>> {
+    let nb_side_min = min(nb_left, nb_right);
+    let nb_vertices = nb_left+nb_right;
+    let weight_lowerbound = -((nb_left+nb_right) as i32) -1;
+    let mut accessed = vec![false ; nb_left];
+
+    //We try to find a positive cycle accessible from the left
+    //vertex i.
+    for i in 0..nb_left{
+        if accessed[i] {
+            continue;
+        }
+        let mut weight = vec![weight_lowerbound ; nb_vertices];
+        let mut prev = vec![ edge_vec.len() ; nb_vertices];
+        weight[i] = 0;
+        //We compute largest weighted paths from i.
+        //Since the graph is bipartite, any simple cycle has length
+        //at most 2*nb_side_min. In the general Bellman-Ford
+        //algorithm, the bound here is the number of vertices. Since
+        //the number of partitions can be much larger than the
+        //number of nodes, we optimize that.
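//Added note, sketching the argument: simple paths in a bipartite
//graph alternate sides, so a simple cycle has at most
//2*min(nb_left, nb_right) edges; after that many relaxation rounds,
//an edge that still relaxes (the check further below) must lie on a
//path branching from a positive-weight cycle, as in Bellman-Ford
//negative-cycle detection with the signs flipped.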
+ for _ in 0..(2*nb_side_min) { + for j in 0..edge_vec.len() { + let e = edge_vec[j]; + if weight[e.v] < weight[e.u]+e.w { + weight[e.v] = weight[e.u]+e.w; + prev[e.v] = j; + } + } + } + //We update the accessed table + for i in 0..nb_left { + if weight[i] > weight_lowerbound { + accessed[i] = true; + } + } + //We detect positive cycle + for e in edge_vec { + if weight[e.v] < weight[e.u]+e.w { + //it means e is on a path branching from a positive cycle + let mut was_seen = vec![false ; nb_vertices]; + let mut curr = e.u; + //We track back with prev until we reach the cycle. + while !was_seen[curr]{ + was_seen[curr] = true; + curr = edge_vec[prev[curr]].u; + } + //Now curr is on the cycle. We collect the edges ids. + let mut cycle = Vec::::new(); + cycle.push(prev[curr]); + let mut cycle_vert = edge_vec[prev[curr]].u; + while cycle_vert != curr { + cycle.push(prev[cycle_vert]); + cycle_vert = edge_vec[prev[cycle_vert]].u; + } + + return Some(cycle); + } + } + } + + None +} + + +// This function takes two arrays of capacity and computes the +// maximal matching in the complete bipartite graph such that the +// left vertex i is matched to left_cap_vec[i] right vertices, and +// the right vertex j is matched to right_cap_vec[j] left vertices. +// To do so, we use Dinic's maximum flow algorithm. +pub fn dinic_compute_matching( left_cap_vec : Vec, + right_cap_vec : Vec) -> Vec< Vec > +{ + let mut graph = Vec:: >::new(); + let ed = EdgeFlow{c:0,flow:0,v:0, rev:0}; + + // 0 will be the source + graph.push(vec![ed ; left_cap_vec.len()]); + for i in 0..left_cap_vec.len() + { + graph[0][i].c = left_cap_vec[i] as i32; + graph[0][i].v = i+2; + graph[0][i].rev = 0; + } + + //1 will be the sink + graph.push(vec![ed ; right_cap_vec.len()]); + for i in 0..right_cap_vec.len() + { + graph[1][i].c = right_cap_vec[i] as i32; + graph[1][i].v = i+2+left_cap_vec.len(); + graph[1][i].rev = 0; + } + + //we add left vertices + for i in 0..left_cap_vec.len() { + graph.push(vec![ed ; 1+right_cap_vec.len()]); + graph[i+2][0].c = 0; //directed + graph[i+2][0].v = 0; + graph[i+2][0].rev = i; + + for j in 0..right_cap_vec.len() { + graph[i+2][j+1].c = 1; + graph[i+2][j+1].v = 2+left_cap_vec.len()+j; + graph[i+2][j+1].rev = i+1; + } + } + + //we add right vertices + for i in 0..right_cap_vec.len() { + let lft_ln = left_cap_vec.len(); + graph.push(vec![ed ; 1+lft_ln]); + graph[i+lft_ln+2][0].c = graph[1][i].c; + graph[i+lft_ln+2][0].v = 1; + graph[i+lft_ln+2][0].rev = i; + + for j in 0..left_cap_vec.len() { + graph[i+2+lft_ln][j+1].c = 0; //directed + graph[i+2+lft_ln][j+1].v = j+2; + graph[i+2+lft_ln][j+1].rev = i+1; + } + } + + //To ensure the dispersion of the triplets generated by the + //assignation, we shuffle the neighbours of the nodes. Hence, + //left vertices do not consider the right ones in the same order. + let mut rng = rand::thread_rng(); + for i in 0..graph.len() { + graph[i].shuffle(&mut rng); + //We need to update the ids of the reverse edges. + for j in 0..graph[i].len() { + let target_v = graph[i][j].v; + let target_rev = graph[i][j].rev; + graph[target_v][target_rev].rev = j; + } + } + + let nb_vertices = graph.len(); + + //We run Dinic's max flow algorithm + loop{ + //We build the level array from Dinic's algorithm. 
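//Added note: level[v] is the BFS distance from the source using
//only edges with remaining residual capacity (c - flow > 0); the
//DFS phase that follows only pushes flow along edges going from
//level l to level l+1, which bounds the number of phases and
//guarantees termination of Dinic's algorithm.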
+ let mut level = vec![-1; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((0,0)); + while !fifo.is_empty() { + if let Some((id,lvl)) = fifo.pop_front(){ + if level[id] == -1 { + level[id] = lvl; + for e in graph[id].iter(){ + if e.c-e.flow > 0{ + fifo.push_back((e.v,lvl+1)); + } + } + } + } + } + if level[1] == -1 { + //There is no residual flow + break; + } + + //Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = VecDeque::new(); + + let flow_upper_bound; + if let Some(x) = left_cap_vec.iter().max() { + flow_upper_bound=*x as i32; + } + else { + flow_upper_bound = 0; + assert!(false); + } + + lifo.push_back((0,flow_upper_bound)); + + loop + { + if let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == 1 { + //The DFS reached the sink, we can add a + //residual flow. + lifo.pop_back(); + while !lifo.is_empty() { + if let Some((id,_)) = lifo.pop_back(){ + let nbd=next_nbd[id]; + graph[id][nbd].flow += f; + let id_v = graph[id][nbd].v; + let nbd_v = graph[id][nbd].rev; + graph[id_v][nbd_v].flow -= f; + } + } + lifo.push_back((0,flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back(){ + next_nbd[*parent] +=1; + } + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min(f,graph[id][nbd].c + - graph[id][nbd].flow); + if level[graph[id][nbd].v] <= level[id] || + new_flow == 0 { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + //otherwise, we send flow to nbd. + lifo.push_back((graph[id][nbd].v, new_flow)); + } + else { + break; + } + } + } + + //We return the association + let assoc_table = (0..left_cap_vec.len()).map( + |id| graph[id+2].iter() + .filter(|e| e.flow > 0) + .map( |e| e.v-2-left_cap_vec.len()) + .collect()).collect(); + + //consistency check + + //it is a flow + for i in 3..graph.len(){ + assert!( graph[i].iter().map(|e| e.flow).sum::() == 0); + for e in graph[i].iter(){ + assert!(e.flow + graph[e.v][e.rev].flow == 0); + } + } + + //it solves the matching problem + for i in 0..left_cap_vec.len(){ + assert!(left_cap_vec[i] as i32 == + graph[i+2].iter().map(|e| max(0,e.flow)).sum::()); + } + for i in 0..right_cap_vec.len(){ + assert!(right_cap_vec[i] as i32 == + graph[i+2+left_cap_vec.len()].iter() + .map(|e| max(0,e.flow)).sum::()); + } + + + assoc_table +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_flow() { + let left_vec = vec![3;8]; + let right_vec = vec![0,4,8,4,8]; + //There are asserts in the function that computes the flow + let _ = dinic_compute_matching(left_vec, right_vec); + } + + //maybe add tests relative to the matching optilization ? 
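//A possible sketch answering the note above (illustrative, not part
//of the original commit): optimize_matching() preserves the degrees
//of new_match on both sides and, since it only flips positive
//cycles, cannot end up farther from old_match than new_match is.
#[test]
fn test_optimize_matching_sketch() {
    let old_match = vec![vec![0, 1], vec![1], vec![2]];
    let new_match = vec![vec![0], vec![0, 2], vec![1]];
    let nb_right = 3;
    let opt = optimize_matching(&old_match, &new_match, nb_right);

    //Degrees of new_match are preserved at every left vertex...
    for i in 0..new_match.len() {
        assert_eq!(opt[i].len(), new_match[i].len());
    }
    //...and at every right vertex.
    for j in 0..nb_right {
        let deg = |m: &Vec<Vec<usize>>| m.iter().filter(|x| x.contains(&j)).count();
        assert_eq!(deg(&opt), deg(&new_match));
    }
    //Flipping only positive cycles never moves us away from old_match.
    let dist = |m: &Vec<Vec<usize>>| -> usize {
        m.iter()
            .zip(old_match.iter())
            .map(|(x, y)| x.iter().filter(|v| !y.contains(*v)).count())
            .sum()
    };
    assert!(dist(&opt) <= dist(&new_match));
}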
+} + + diff --git a/src/util/lib.rs b/src/util/lib.rs index e83fc2e6..891549c3 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -4,6 +4,7 @@ extern crate tracing; pub mod background; +pub mod bipartite; pub mod config; pub mod crdt; pub mod data; -- cgit v1.2.3 From 2aeaddd5e2e1911b084f6d49ccb2236b7fec31af Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 1 May 2022 09:57:05 +0200 Subject: Apply cargo fmt --- src/rpc/layout.rs | 940 ++++++++++++++++++++++++++------------------------ src/util/bipartite.rs | 694 +++++++++++++++++++------------------ 2 files changed, 842 insertions(+), 792 deletions(-) (limited to 'src') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index afd7df17..ac31da72 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,12 +1,12 @@ +use std::cmp::min; use std::cmp::Ordering; -use std::cmp::{min}; -use std::collections::{HashMap}; +use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use garage_util::bipartite::*; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; -use garage_util::bipartite::*; use rand::prelude::SliceRandom; @@ -168,454 +168,506 @@ impl ClusterLayout { true } + /// This function calculates a new partition-to-node assignation. + /// The computed assignation maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignation, it minimizes the distance to + /// the former assignation (if any) to minimize the amount of + /// data to be moved. A heuristic ensures node triplets + /// dispersion (in garage_util::bipartite::optimize_matching()). + pub fn calculate_partition_assignation(&mut self) -> bool { + //The nodes might have been updated, some might have been deleted. + //So we need to first update the list of nodes and retrieve the + //assignation. + let old_node_assignation = self.update_nodes_and_ring(); + + let (node_zone, _) = self.get_node_zone_capacity(); + + //We compute the optimal number of partition to assign to + //every node and zone. + if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions() { + //We collect part_per_zone in a vec to not rely on the + //arbitrary order in which elements are iterated in + //Hashmap::iter() + let part_per_zone_vec = part_per_zone + .iter() + .map(|(x, y)| (x.clone(), *y)) + .collect::>(); + //We create an indexing of the zones + let mut zone_id = HashMap::::new(); + for i in 0..part_per_zone_vec.len() { + zone_id.insert(part_per_zone_vec[i].0.clone(), i); + } - /// This function calculates a new partition-to-node assignation. - /// The computed assignation maximizes the capacity of a - /// partition (assuming all partitions have the same size). - /// Among such optimal assignation, it minimizes the distance to - /// the former assignation (if any) to minimize the amount of - /// data to be moved. A heuristic ensures node triplets - /// dispersion (in garage_util::bipartite::optimize_matching()). - pub fn calculate_partition_assignation(&mut self) -> bool { - - //The nodes might have been updated, some might have been deleted. - //So we need to first update the list of nodes and retrieve the - //assignation. - let old_node_assignation = self.update_nodes_and_ring(); - - let (node_zone, _) = self.get_node_zone_capacity(); - - //We compute the optimal number of partition to assign to - //every node and zone. 
- if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions(){ - //We collect part_per_zone in a vec to not rely on the - //arbitrary order in which elements are iterated in - //Hashmap::iter() - let part_per_zone_vec = part_per_zone.iter() - .map(|(x,y)| (x.clone(),*y)) - .collect::>(); - //We create an indexing of the zones - let mut zone_id = HashMap::::new(); - for i in 0..part_per_zone_vec.len(){ - zone_id.insert(part_per_zone_vec[i].0.clone(), i); - } - - //We compute a candidate for the new partition to zone - //assignation. - let nb_zones = part_per_zone.len(); - let nb_nodes = part_per_nod.len(); - let nb_partitions = 1<> = - old_node_assignation.iter().map(|x| x.iter().map( - |id| match *id { Some(i) => zone_id[&node_zone[i]] , - None => no_zone } - ).collect()).collect(); - - //We minimize the distance to the former zone assignation - zone_assignation = optimize_matching( - &old_zone_assignation, &zone_assignation, nb_zones+1); //+1 for no_zone - - //We need to assign partitions to nodes in their zone - //We first put the nodes assignation that can stay the same - for i in 0..nb_partitions{ - for j in 0..self.replication_factor { - if let Some(Some(former_node)) = old_node_assignation[i].iter().find( - |x| if let Some(id) = x { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - } - else {false} - ) - { - if part_per_nod[*former_node] > 0 { - node_assignation[i][j] = Some(*former_node); - part_per_nod[*former_node] -= 1; - } - } - } - } - - - //We complete the assignation of partitions to nodes - let mut rng = rand::thread_rng(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if node_assignation[i][j] == None { - let possible_nodes : Vec = (0..nb_nodes) - .filter( - |id| zone_id[&node_zone[*id]] == zone_assignation[i][j] - && part_per_nod[*id] > 0).collect(); - assert!(possible_nodes.len()>0); - //We randomly pick a node - if let Some(nod) = possible_nodes.choose(&mut rng){ - node_assignation[i][j] = Some(*nod); - part_per_nod[*nod] -= 1; - } - } - } - } - - //We write the assignation in the 1D table - self.ring_assignation_data = Vec::::new(); - for i in 0..nb_partitions{ - for j in 0..self.replication_factor { - if let Some(id) = node_assignation[i][j] { - self.ring_assignation_data.push(id as CompactNodeType); - } - else {assert!(false)} - } - } - - true - } - else { false } - } - - /// The LwwMap of node roles might have changed. This function updates the node_id_vec - /// and returns the assignation given by ring, with the new indices of the nodes, and - /// None of the node is not present anymore. - /// We work with the assumption that only this function and calculate_new_assignation - /// do modify assignation_ring and node_id_vec. - fn update_nodes_and_ring(&mut self) -> Vec>> { - let nb_partitions = 1usize< = self.roles.items().iter() - .map(|(k, _, _)| *k) - .collect(); - - if ring.len() == rf*nb_partitions { - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - node_assignation[i][j] = new_node_id_vec.iter() - .position(|id| *id == self.node_id_vec[ring[i*rf + j] as usize]); - } - } - } - - self.node_id_vec = new_node_id_vec; - self.ring_assignation_data = vec![]; - return node_assignation; - } - - ///This function compute the number of partition to assign to - ///every node and zone, so that every partition is replicated - ///self.replication_factor times and the capacity of a partition - ///is maximized. 
- fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { - - let mut zone_capacity :HashMap= HashMap::new(); - - let (node_zone, node_capacity) = self.get_node_zone_capacity(); - let nb_nodes = self.node_id_vec.len(); - - for i in 0..nb_nodes - { - if zone_capacity.contains_key(&node_zone[i]) { - zone_capacity.insert(node_zone[i].clone(), zone_capacity[&node_zone[i]] + node_capacity[i]); - } - else{ - zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); - } - } - - //Compute the optimal number of partitions per zone - let sum_capacities: u32 =zone_capacity.values().sum(); - - if sum_capacities <= 0 { - println!("No storage capacity in the network."); - return None; - } - - let nb_partitions = 1< = - zone_capacity.iter() - .map(|(k, v)| (k.clone(), min(nb_partitions, - (self.replication_factor*nb_partitions - **v as usize)/sum_capacities as usize) ) ).collect(); - - //The replication_factor-1 upper bounds the number of - //part_per_zones that are greater than nb_partitions - for _ in 1..self.replication_factor { - //The number of partitions that are not assignated to - //a zone that takes nb_partitions. - let sum_capleft : u32 = zone_capacity.keys() - .filter(| k | {part_per_zone[*k] < nb_partitions} ) - .map(|k| zone_capacity[k]).sum(); - - //The number of replication of the data that we need - //to ensure. - let repl_left = self.replication_factor - - part_per_zone.values() - .filter(|x| {**x == nb_partitions}) - .count(); - if repl_left == 0 { - break; - } - - for k in zone_capacity.keys() { - if part_per_zone[k] != nb_partitions - { - part_per_zone.insert(k.to_string() , min(nb_partitions, - (nb_partitions*zone_capacity[k] as usize - *repl_left)/sum_capleft as usize)); - } - } - } - - //Now we divide the zone's partition share proportionally - //between their nodes. - - let mut part_per_nod : Vec = (0..nb_nodes).map( - |i| (part_per_zone[&node_zone[i]]*node_capacity[i] as usize)/zone_capacity[&node_zone[i]] as usize - ) - .collect(); - - //We must update the part_per_zone to make it correspond to - //part_per_nod (because of integer rounding) - part_per_zone = part_per_zone.iter().map(|(k,_)| - (k.clone(), 0)) - .collect(); - for i in 0..nb_nodes { - part_per_zone.insert( - node_zone[i].clone() , - part_per_zone[&node_zone[i]] + part_per_nod[i]); - } - - //Because of integer rounding, the total sum of part_per_nod - //might not be replication_factor*nb_partitions. - // We need at most to add 1 to every non maximal value of - // part_per_nod. The capacity of a partition will be bounded - // by the minimal value of - // node_capacity_vec[i]/part_per_nod[i] - // so we try to maximize this minimal value, keeping the - // part_per_zone capped - - let discrepancy : usize = - nb_partitions*self.replication_factor - - part_per_nod.iter().sum::(); - - //We use a stupid O(N^2) algorithm. If the number of nodes - //is actually expected to be high, one should optimize this. 
- - for _ in 0..discrepancy { - if let Some(idmax) = (0..nb_nodes) - .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) - .max_by( |i,j| - (node_capacity[*i]*(part_per_nod[*j]+1) as u32) - .cmp(&(node_capacity[*j]*(part_per_nod[*i]+1) as u32)) - ) - { - part_per_nod[idmax] += 1; - part_per_zone.insert(node_zone[idmax].clone(),part_per_zone[&node_zone[idmax]]+1); - } - } - - //We check the algorithm consistency - - let discrepancy : usize = - nb_partitions*self.replication_factor - - part_per_nod.iter().sum::(); - assert!(discrepancy == 0); - assert!(if let Some(v) = part_per_zone.values().max() - {*v <= nb_partitions} else {false} ); - - Some((part_per_nod, part_per_zone)) - } - - - //Returns vectors of zone and capacity; indexed by the same (temporary) - //indices as node_id_vec. - fn get_node_zone_capacity(& self) -> (Vec , Vec) { - - let node_zone = self.node_id_vec.iter().map( - |id_nod| match self.node_role(id_nod) { - Some(NodeRole{zone,capacity:_,tags:_}) => zone.clone() , - _ => "".to_string() - } - ).collect(); - - let node_capacity = self.node_id_vec.iter().map( - |id_nod| match self.node_role(id_nod) { - Some(NodeRole{zone:_,capacity,tags:_}) => - if let Some(c)=capacity - {*c} - else {0}, - _ => 0 - } - ).collect(); - - (node_zone,node_capacity) - } + //We compute a candidate for the new partition to zone + //assignation. + let nb_zones = part_per_zone.len(); + let nb_nodes = part_per_nod.len(); + let nb_partitions = 1 << PARTITION_BITS; + let left_cap_vec = vec![self.replication_factor as u32; nb_partitions]; + let right_cap_vec = part_per_zone_vec.iter().map(|(_, y)| *y as u32).collect(); + let mut zone_assignation = dinic_compute_matching(left_cap_vec, right_cap_vec); + + //We create the structure for the partition-to-node assignation. + let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; + //We will decrement part_per_nod to keep track of the number + //of partitions that we still have to associate. 
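//Added note on the instance solved above: the bipartite flow graph
//has one left vertex per partition with capacity replication_factor
//and one right vertex per zone with capacity part_per_zone[z]; the
//partition-to-zone edges have capacity 1, so a maximum flow assigns
//each partition to replication_factor distinct zones while
//respecting the per-zone quotas.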
+ let mut part_per_nod = part_per_nod.clone(); + + //We minimize the distance to the former assignation(if any) + + //We get the id of the zones of the former assignation + //(and the id no_zone if there is no node assignated) + let no_zone = part_per_zone_vec.len(); + let old_zone_assignation: Vec> = old_node_assignation + .iter() + .map(|x| { + x.iter() + .map(|id| match *id { + Some(i) => zone_id[&node_zone[i]], + None => no_zone, + }) + .collect() + }) + .collect(); + + //We minimize the distance to the former zone assignation + zone_assignation = + optimize_matching(&old_zone_assignation, &zone_assignation, nb_zones + 1); //+1 for no_zone + + //We need to assign partitions to nodes in their zone + //We first put the nodes assignation that can stay the same + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if let Some(Some(former_node)) = old_node_assignation[i].iter().find(|x| { + if let Some(id) = x { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + } else { + false + } + }) { + if part_per_nod[*former_node] > 0 { + node_assignation[i][j] = Some(*former_node); + part_per_nod[*former_node] -= 1; + } + } + } + } -} + //We complete the assignation of partitions to nodes + let mut rng = rand::thread_rng(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if node_assignation[i][j] == None { + let possible_nodes: Vec = (0..nb_nodes) + .filter(|id| { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + && part_per_nod[*id] > 0 + }) + .collect(); + assert!(possible_nodes.len() > 0); + //We randomly pick a node + if let Some(nod) = possible_nodes.choose(&mut rng) { + node_assignation[i][j] = Some(*nod); + part_per_nod[*nod] -= 1; + } + } + } + } + + //We write the assignation in the 1D table + self.ring_assignation_data = Vec::::new(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if let Some(id) = node_assignation[i][j] { + self.ring_assignation_data.push(id as CompactNodeType); + } else { + assert!(false) + } + } + } + true + } else { + false + } + } + + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignation given by ring, with the new indices of the nodes, and + /// None of the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignation + /// do modify assignation_ring and node_id_vec. + fn update_nodes_and_ring(&mut self) -> Vec>> { + let nb_partitions = 1usize << PARTITION_BITS; + let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; + let rf = self.replication_factor; + let ring = &self.ring_assignation_data; + + let new_node_id_vec: Vec = self.roles.items().iter().map(|(k, _, _)| *k).collect(); + + if ring.len() == rf * nb_partitions { + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + node_assignation[i][j] = new_node_id_vec + .iter() + .position(|id| *id == self.node_id_vec[ring[i * rf + j] as usize]); + } + } + } + + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = vec![]; + return node_assignation; + } + + ///This function compute the number of partition to assign to + ///every node and zone, so that every partition is replicated + ///self.replication_factor times and the capacity of a partition + ///is maximized. 
+ fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { + let mut zone_capacity: HashMap = HashMap::new(); + + let (node_zone, node_capacity) = self.get_node_zone_capacity(); + let nb_nodes = self.node_id_vec.len(); + + for i in 0..nb_nodes { + if zone_capacity.contains_key(&node_zone[i]) { + zone_capacity.insert( + node_zone[i].clone(), + zone_capacity[&node_zone[i]] + node_capacity[i], + ); + } else { + zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); + } + } + + //Compute the optimal number of partitions per zone + let sum_capacities: u32 = zone_capacity.values().sum(); + + if sum_capacities <= 0 { + println!("No storage capacity in the network."); + return None; + } + + let nb_partitions = 1 << PARTITION_BITS; + + //Initially we would like to use zones porportionally to + //their capacity. + //However, a large zone can be associated to at most + //nb_partitions to ensure replication of the date. + //So we take the min with nb_partitions: + let mut part_per_zone: HashMap = zone_capacity + .iter() + .map(|(k, v)| { + ( + k.clone(), + min( + nb_partitions, + (self.replication_factor * nb_partitions * *v as usize) + / sum_capacities as usize, + ), + ) + }) + .collect(); + + //The replication_factor-1 upper bounds the number of + //part_per_zones that are greater than nb_partitions + for _ in 1..self.replication_factor { + //The number of partitions that are not assignated to + //a zone that takes nb_partitions. + let sum_capleft: u32 = zone_capacity + .keys() + .filter(|k| part_per_zone[*k] < nb_partitions) + .map(|k| zone_capacity[k]) + .sum(); + + //The number of replication of the data that we need + //to ensure. + let repl_left = self.replication_factor + - part_per_zone + .values() + .filter(|x| **x == nb_partitions) + .count(); + if repl_left == 0 { + break; + } + + for k in zone_capacity.keys() { + if part_per_zone[k] != nb_partitions { + part_per_zone.insert( + k.to_string(), + min( + nb_partitions, + (nb_partitions * zone_capacity[k] as usize * repl_left) + / sum_capleft as usize, + ), + ); + } + } + } + + //Now we divide the zone's partition share proportionally + //between their nodes. + + let mut part_per_nod: Vec = (0..nb_nodes) + .map(|i| { + (part_per_zone[&node_zone[i]] * node_capacity[i] as usize) + / zone_capacity[&node_zone[i]] as usize + }) + .collect(); + + //We must update the part_per_zone to make it correspond to + //part_per_nod (because of integer rounding) + part_per_zone = part_per_zone.iter().map(|(k, _)| (k.clone(), 0)).collect(); + for i in 0..nb_nodes { + part_per_zone.insert( + node_zone[i].clone(), + part_per_zone[&node_zone[i]] + part_per_nod[i], + ); + } + + //Because of integer rounding, the total sum of part_per_nod + //might not be replication_factor*nb_partitions. + // We need at most to add 1 to every non maximal value of + // part_per_nod. The capacity of a partition will be bounded + // by the minimal value of + // node_capacity_vec[i]/part_per_nod[i] + // so we try to maximize this minimal value, keeping the + // part_per_zone capped + + let discrepancy: usize = + nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); + + //We use a stupid O(N^2) algorithm. If the number of nodes + //is actually expected to be high, one should optimize this. 
+ + for _ in 0..discrepancy { + if let Some(idmax) = (0..nb_nodes) + .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) + .max_by(|i, j| { + (node_capacity[*i] * (part_per_nod[*j] + 1) as u32) + .cmp(&(node_capacity[*j] * (part_per_nod[*i] + 1) as u32)) + }) { + part_per_nod[idmax] += 1; + part_per_zone.insert( + node_zone[idmax].clone(), + part_per_zone[&node_zone[idmax]] + 1, + ); + } + } + //We check the algorithm consistency + + let discrepancy: usize = + nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); + assert!(discrepancy == 0); + assert!(if let Some(v) = part_per_zone.values().max() { + *v <= nb_partitions + } else { + false + }); + + Some((part_per_nod, part_per_zone)) + } + + //Returns vectors of zone and capacity; indexed by the same (temporary) + //indices as node_id_vec. + fn get_node_zone_capacity(&self) -> (Vec, Vec) { + let node_zone = self + .node_id_vec + .iter() + .map(|id_nod| match self.node_role(id_nod) { + Some(NodeRole { + zone, + capacity: _, + tags: _, + }) => zone.clone(), + _ => "".to_string(), + }) + .collect(); + + let node_capacity = self + .node_id_vec + .iter() + .map(|id_nod| match self.node_role(id_nod) { + Some(NodeRole { + zone: _, + capacity, + tags: _, + }) => { + if let Some(c) = capacity { + *c + } else { + 0 + } + } + _ => 0, + }) + .collect(); + + (node_zone, node_capacity) + } +} #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; - - fn check_assignation(cl : &ClusterLayout) { - - //Check that input data has the right format - let nb_partitions = 1usize<>(); - - let zone_vec = node_zone.iter().unique().collect::>(); - let zone_nb_part = zone_vec.iter().map( |z| cl.ring_assignation_data.iter() - .filter(|x| node_zone[**x as usize] == **z) - .count() - ).collect::>(); - - //Check optimality of the zone assignation : would it be better for the - //node_capacity/node_partitions ratio to change the assignation of a partition - - if let Some(idmin) = (0..nb_nodes).min_by( - |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) - ){ - if let Some(idnew) = (0..nb_nodes) - .filter( |i| if let Some(p) = zone_vec.iter().position(|z| **z==node_zone[*i]) - {zone_nb_part[p] < nb_partitions } - else { false }) - .max_by( - |i,j| (node_capacity[*i]*(node_nb_part[*j]as u32+1)) - .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) - ){ - assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= - node_capacity[idnew]*node_nb_part[idmin] as u32); - } - - } - - //In every zone, check optimality of the nod assignation - for z in zone_vec { - let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z ); - if let Some(idmin) = node_of_z_iter.clone().min_by( - |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) - ){ - if let Some(idnew) = node_of_z_iter.min_by( - |i,j| (node_capacity[*i]*(node_nb_part[*j] as u32+1)) - .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) - ){ - assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= - node_capacity[idnew]*node_nb_part[idmin] as u32); - } - } - } - - } - - fn update_layout(cl : &mut ClusterLayout, node_id_vec : &Vec, - node_capacity_vec : &Vec , node_zone_vec : &Vec) { - for i in 0..node_id_vec.len(){ - if let Some(x) = FixedBytes32::try_from(&[i as u8;32]) { - cl.node_id_vec.push(x); - } - - let update = cl.roles.update_mutator(cl.node_id_vec[i] , - NodeRoleV(Some(NodeRole{ - zone : (node_zone_vec[i].to_string()), - capacity : 
(Some(node_capacity_vec[i])), - tags : (vec![])}))); - cl.roles.merge(&update); - } - } - - #[test] - fn test_assignation() { - - let mut node_id_vec = vec![1,2,3]; - let mut node_capacity_vec = vec![4000,1000,2000]; - let mut node_zone_vec= vec!["A", "B", "C"].into_iter().map(|x| x.to_string()).collect(); - - let mut cl = ClusterLayout { - node_id_vec: vec![], - - roles : LwwMap::new(), - - replication_factor: 3, - ring_assignation_data : vec![], - version:0, - staging: LwwMap::new(), - staging_hash: sha256sum(&[1;32]), - }; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - node_id_vec = vec![1,2,3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000,1000,1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec= vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"].into_iter().map(|x| x.to_string()).collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - node_capacity_vec = vec![4000,1000,2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - - node_capacity_vec = vec![4000,4000,2000, 7000, 1000, 9000, 2000, 10, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - } -} + use super::*; + use itertools::Itertools; + + fn check_assignation(cl: &ClusterLayout) { + //Check that input data has the right format + let nb_partitions = 1usize << PARTITION_BITS; + assert!([1, 2, 3].contains(&cl.replication_factor)); + assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); + + let (node_zone, node_capacity) = cl.get_node_zone_capacity(); + + //Check that is is a correct assignation with zone redundancy + let rf = cl.replication_factor; + for i in 0..nb_partitions { + assert!( + rf == cl.ring_assignation_data[rf * i..rf * (i + 1)] + .iter() + .map(|nod| node_zone[*nod as usize].clone()) + .unique() + .count() + ); + } + + let nb_nodes = cl.node_id_vec.len(); + //Check optimality + let node_nb_part = (0..nb_nodes) + .map(|i| { + cl.ring_assignation_data + .iter() + .filter(|x| **x == i as u8) + .count() + }) + .collect::>(); + + let zone_vec = node_zone.iter().unique().collect::>(); + let zone_nb_part = zone_vec + .iter() + .map(|z| { + cl.ring_assignation_data + .iter() + .filter(|x| node_zone[**x as usize] == **z) + .count() + }) + .collect::>(); + + //Check optimality of the zone assignation : would it be better for the + //node_capacity/node_partitions ratio to change the assignation of a partition + + if let Some(idmin) = (0..nb_nodes).min_by(|i, j| { + (node_capacity[*i] * node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) + }) { + if let Some(idnew) = (0..nb_nodes) + .filter(|i| { + if let Some(p) = zone_vec.iter().position(|z| **z == node_zone[*i]) { + zone_nb_part[p] < nb_partitions + } else { + false + } + }) + .max_by(|i, j| { + (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) + .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) + }) { + assert!( + node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) + >= node_capacity[idnew] * node_nb_part[idmin] as u32 + ); + } + } + + //In every zone, check optimality of the nod assignation + for z in zone_vec { + let node_of_z_iter = (0..nb_nodes).filter(|id| 
node_zone[*id] == *z); + if let Some(idmin) = node_of_z_iter.clone().min_by(|i, j| { + (node_capacity[*i] * node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) + }) { + if let Some(idnew) = node_of_z_iter.min_by(|i, j| { + (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) + .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) + }) { + assert!( + node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) + >= node_capacity[idnew] * node_nb_part[idmin] as u32 + ); + } + } + } + } + + fn update_layout( + cl: &mut ClusterLayout, + node_id_vec: &Vec, + node_capacity_vec: &Vec, + node_zone_vec: &Vec, + ) { + for i in 0..node_id_vec.len() { + if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { + cl.node_id_vec.push(x); + } + let update = cl.roles.update_mutator( + cl.node_id_vec[i], + NodeRoleV(Some(NodeRole { + zone: (node_zone_vec[i].to_string()), + capacity: (Some(node_capacity_vec[i])), + tags: (vec![]), + })), + ); + cl.roles.merge(&update); + } + } + + #[test] + fn test_assignation() { + let mut node_id_vec = vec![1, 2, 3]; + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let mut cl = ClusterLayout { + node_id_vec: vec![], + roles: LwwMap::new(), + replication_factor: 3, + ring_assignation_data: vec![], + version: 0, + staging: LwwMap::new(), + staging_hash: sha256sum(&[1; 32]), + }; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000, 4000, 2000, 7000, 1000, 9000, 2000, 10, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + } +} diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs index aec7b042..ade831a4 100644 --- a/src/util/bipartite.rs +++ b/src/util/bipartite.rs @@ -1,378 +1,376 @@ /* - * This module deals with graph algorithm in complete bipartite + * This module deals with graph algorithm in complete bipartite * graphs. It is used in layout.rs to build the partition to node * assignation. * */ -use std::cmp::{min,max}; -use std::collections::VecDeque; use rand::prelude::SliceRandom; +use std::cmp::{max, min}; +use std::collections::VecDeque; //Graph data structure for the flow algorithm. -#[derive(Clone,Copy,Debug)] -struct EdgeFlow{ - c : i32, - flow : i32, - v : usize, - rev : usize, +#[derive(Clone, Copy, Debug)] +struct EdgeFlow { + c: i32, + flow: i32, + v: usize, + rev: usize, } //Graph data structure for the detection of positive cycles. 
-#[derive(Clone,Copy,Debug)] -struct WeightedEdge{ - w : i32, - u : usize, - v : usize, +#[derive(Clone, Copy, Debug)] +struct WeightedEdge { + w: i32, + u: usize, + v: usize, } - -/* This function takes two matchings (old_match and new_match) in a - * complete bipartite graph. It returns a matching that has the +/* This function takes two matchings (old_match and new_match) in a + * complete bipartite graph. It returns a matching that has the * same degree as new_match at every vertex, and that is as close * as possible to old_match. * */ -pub fn optimize_matching( old_match : &Vec> , - new_match : &Vec> , - nb_right : usize ) - -> Vec> { - let nb_left = old_match.len(); - let ed = WeightedEdge{w:-1,u:0,v:0}; - let mut edge_vec = vec![ed ; nb_left*nb_right]; - - //We build the complete bipartite graph structure, represented - //by the list of all edges. - for i in 0..nb_left { - for j in 0..nb_right{ - edge_vec[i*nb_right + j].u = i; - edge_vec[i*nb_right + j].v = nb_left+j; - } - } - - for i in 0..edge_vec.len() { - //We add the old matchings - if old_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) { - edge_vec[i].w *= -1; - } - //We add the new matchings - if new_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) { - (edge_vec[i].u,edge_vec[i].v) = - (edge_vec[i].v,edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - //Now edge_vec is a graph where edges are oriented LR if we - //can add them to new_match, and RL otherwise. If - //adding/removing them makes the matching closer to old_match - //they have weight 1; and -1 otherwise. - - //We shuffle the edge list so that there is no bias depending in - //partitions/zone label in the triplet dispersion - let mut rng = rand::thread_rng(); - edge_vec.shuffle(&mut rng); - - //Discovering and flipping a cycle with positive weight in this - //graph will make the matching closer to old_match. - //We use Bellman Ford algorithm to discover positive cycles - loop{ - if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { - for i in cycle { - //We flip the edges of the cycle. - (edge_vec[i].u,edge_vec[i].v) = - (edge_vec[i].v,edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - else { - //If there is no cycle, we return the optimal matching. - break; - } - } - - //The optimal matching is build from the graph structure. - let mut matching = vec![Vec::::new() ; nb_left]; - for e in edge_vec { - if e.u > e.v { - matching[e.v].push(e.u-nb_left); - } - } - matching +pub fn optimize_matching( + old_match: &Vec>, + new_match: &Vec>, + nb_right: usize, +) -> Vec> { + let nb_left = old_match.len(); + let ed = WeightedEdge { w: -1, u: 0, v: 0 }; + let mut edge_vec = vec![ed; nb_left * nb_right]; + + //We build the complete bipartite graph structure, represented + //by the list of all edges. + for i in 0..nb_left { + for j in 0..nb_right { + edge_vec[i * nb_right + j].u = i; + edge_vec[i * nb_right + j].v = nb_left + j; + } + } + + for i in 0..edge_vec.len() { + //We add the old matchings + if old_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { + edge_vec[i].w *= -1; + } + //We add the new matchings + if new_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { + (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); + edge_vec[i].w *= -1; + } + } + //Now edge_vec is a graph where edges are oriented LR if we + //can add them to new_match, and RL otherwise. If + //adding/removing them makes the matching closer to old_match + //they have weight 1; and -1 otherwise. 
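+	//For example, an edge in new_match only is stored reversed (RL) with
+	//weight +1: flipping it removes it from the matching, which brings us
+	//closer to old_match. An edge in both matchings is stored RL with
+	//weight -1, since removing it would move us away from old_match.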
+ + //We shuffle the edge list so that there is no bias depending in + //partitions/zone label in the triplet dispersion + let mut rng = rand::thread_rng(); + edge_vec.shuffle(&mut rng); + + //Discovering and flipping a cycle with positive weight in this + //graph will make the matching closer to old_match. + //We use Bellman Ford algorithm to discover positive cycles + loop { + if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { + for i in cycle { + //We flip the edges of the cycle. + (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); + edge_vec[i].w *= -1; + } + } else { + //If there is no cycle, we return the optimal matching. + break; + } + } + + //The optimal matching is build from the graph structure. + let mut matching = vec![Vec::::new(); nb_left]; + for e in edge_vec { + if e.u > e.v { + matching[e.v].push(e.u - nb_left); + } + } + matching } //This function finds a positive cycle in a bipartite wieghted graph. -fn positive_cycle( edge_vec : &Vec, nb_left : usize, - nb_right : usize) -> Option> { - let nb_side_min = min(nb_left, nb_right); - let nb_vertices = nb_left+nb_right; - let weight_lowerbound = -((nb_left +nb_right) as i32) -1; - let mut accessed = vec![false ; nb_left]; - - //We try to find a positive cycle accessible from the left - //vertex i. - for i in 0..nb_left{ - if accessed[i] { - continue; - } - let mut weight =vec![weight_lowerbound ; nb_vertices]; - let mut prev =vec![ edge_vec.len() ; nb_vertices]; - weight[i] = 0; - //We compute largest weighted paths from i. - //Since the graph is bipartite, any simple cycle has length - //at most 2*nb_side_min. In the general Bellman-Ford - //algorithm, the bound here is the number of vertices. Since - //the number of partitions can be much larger than the - //number of nodes, we optimize that. - for _ in 0..(2*nb_side_min) { - for j in 0..edge_vec.len() { - let e = edge_vec[j]; - if weight[e.v] < weight[e.u]+e.w { - weight[e.v] = weight[e.u]+e.w; - prev[e.v] = j; - } - } - } - //We update the accessed table - for i in 0..nb_left { - if weight[i] > weight_lowerbound { - accessed[i] = true; - } - } - //We detect positive cycle - for e in edge_vec { - if weight[e.v] < weight[e.u]+e.w { - //it means e is on a path branching from a positive cycle - let mut was_seen = vec![false ; nb_vertices]; - let mut curr = e.u; - //We track back with prev until we reach the cycle. - while !was_seen[curr]{ - was_seen[curr] = true; - curr = edge_vec[prev[curr]].u; - } - //Now curr is on the cycle. We collect the edges ids. - let mut cycle = Vec::::new(); - cycle.push(prev[curr]); - let mut cycle_vert = edge_vec[prev[curr]].u; - while cycle_vert != curr { - cycle.push(prev[cycle_vert]); - cycle_vert = edge_vec[prev[cycle_vert]].u; - } - - return Some(cycle); - } - } - } - - None -} +fn positive_cycle( + edge_vec: &Vec, + nb_left: usize, + nb_right: usize, +) -> Option> { + let nb_side_min = min(nb_left, nb_right); + let nb_vertices = nb_left + nb_right; + let weight_lowerbound = -((nb_left + nb_right) as i32) - 1; + let mut accessed = vec![false; nb_left]; + //We try to find a positive cycle accessible from the left + //vertex i. + for i in 0..nb_left { + if accessed[i] { + continue; + } + let mut weight = vec![weight_lowerbound; nb_vertices]; + let mut prev = vec![edge_vec.len(); nb_vertices]; + weight[i] = 0; + //We compute largest weighted paths from i. + //Since the graph is bipartite, any simple cycle has length + //at most 2*nb_side_min. 
In the general Bellman-Ford + //algorithm, the bound here is the number of vertices. Since + //the number of partitions can be much larger than the + //number of nodes, we optimize that. + for _ in 0..(2 * nb_side_min) { + for j in 0..edge_vec.len() { + let e = edge_vec[j]; + if weight[e.v] < weight[e.u] + e.w { + weight[e.v] = weight[e.u] + e.w; + prev[e.v] = j; + } + } + } + //We update the accessed table + for i in 0..nb_left { + if weight[i] > weight_lowerbound { + accessed[i] = true; + } + } + //We detect positive cycle + for e in edge_vec { + if weight[e.v] < weight[e.u] + e.w { + //it means e is on a path branching from a positive cycle + let mut was_seen = vec![false; nb_vertices]; + let mut curr = e.u; + //We track back with prev until we reach the cycle. + while !was_seen[curr] { + was_seen[curr] = true; + curr = edge_vec[prev[curr]].u; + } + //Now curr is on the cycle. We collect the edges ids. + let mut cycle = Vec::::new(); + cycle.push(prev[curr]); + let mut cycle_vert = edge_vec[prev[curr]].u; + while cycle_vert != curr { + cycle.push(prev[cycle_vert]); + cycle_vert = edge_vec[prev[cycle_vert]].u; + } -// This function takes two arrays of capacity and computes the -// maximal matching in the complete bipartite graph such that the + return Some(cycle); + } + } + } + + None +} + +// This function takes two arrays of capacity and computes the +// maximal matching in the complete bipartite graph such that the // left vertex i is matched to left_cap_vec[i] right vertices, and // the right vertex j is matched to right_cap_vec[j] left vertices. // To do so, we use Dinic's maximum flow algorithm. -pub fn dinic_compute_matching( left_cap_vec : Vec, - right_cap_vec : Vec) -> Vec< Vec > -{ - let mut graph = Vec:: >::new(); - let ed = EdgeFlow{c:0,flow:0,v:0, rev:0}; - - // 0 will be the source - graph.push(vec![ed ; left_cap_vec.len()]); - for i in 0..left_cap_vec.len() - { - graph[0][i].c = left_cap_vec[i] as i32; - graph[0][i].v = i+2; - graph[0][i].rev = 0; - } - - //1 will be the sink - graph.push(vec![ed ; right_cap_vec.len()]); - for i in 0..right_cap_vec.len() - { - graph[1][i].c = right_cap_vec[i] as i32; - graph[1][i].v = i+2+left_cap_vec.len(); - graph[1][i].rev = 0; - } - - //we add left vertices - for i in 0..left_cap_vec.len() { - graph.push(vec![ed ; 1+right_cap_vec.len()]); - graph[i+2][0].c = 0; //directed - graph[i+2][0].v = 0; - graph[i+2][0].rev = i; - - for j in 0..right_cap_vec.len() { - graph[i+2][j+1].c = 1; - graph[i+2][j+1].v = 2+left_cap_vec.len()+j; - graph[i+2][j+1].rev = i+1; - } - } - - //we add right vertices - for i in 0..right_cap_vec.len() { - let lft_ln = left_cap_vec.len(); - graph.push(vec![ed ; 1+lft_ln]); - graph[i+lft_ln+2][0].c = graph[1][i].c; - graph[i+lft_ln+2][0].v = 1; - graph[i+lft_ln+2][0].rev = i; - - for j in 0..left_cap_vec.len() { - graph[i+2+lft_ln][j+1].c = 0; //directed - graph[i+2+lft_ln][j+1].v = j+2; - graph[i+2+lft_ln][j+1].rev = i+1; - } - } - - //To ensure the dispersion of the triplets generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //left vertices do not consider the right ones in the same order. - let mut rng = rand::thread_rng(); - for i in 0..graph.len() { - graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. 
- for j in 0..graph[i].len() { - let target_v = graph[i][j].v; - let target_rev = graph[i][j].rev; - graph[target_v][target_rev].rev = j; - } - } - - let nb_vertices = graph.len(); - - //We run Dinic's max flow algorithm - loop{ - //We build the level array from Dinic's algorithm. - let mut level = vec![-1; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((0,0)); - while !fifo.is_empty() { - if let Some((id,lvl)) = fifo.pop_front(){ - if level[id] == -1 { - level[id] = lvl; - for e in graph[id].iter(){ - if e.c-e.flow > 0{ - fifo.push_back((e.v,lvl+1)); - } - } - } - } - } - if level[1] == -1 { - //There is no residual flow - break; - } - - //Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); - - let flow_upper_bound; - if let Some(x) = left_cap_vec.iter().max() { - flow_upper_bound=*x as i32; - } - else { - flow_upper_bound = 0; - assert!(false); - } - - lifo.push_back((0,flow_upper_bound)); - - loop - { - if let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == 1 { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id,_)) = lifo.pop_back(){ - let nbd=next_nbd[id]; - graph[id][nbd].flow += f; - let id_v = graph[id][nbd].v; - let nbd_v = graph[id][nbd].rev; - graph[id_v][nbd_v].flow -= f; - } - } - lifo.push_back((0,flow_upper_bound)); - continue; - } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back(){ - next_nbd[*parent] +=1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f,graph[id][nbd].c - - graph[id][nbd].flow); - if level[graph[id][nbd].v] <= level[id] || - new_flow == 0 { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - //otherwise, we send flow to nbd. 
- lifo.push_back((graph[id][nbd].v, new_flow)); - } - else { - break; - } - } - } - - //We return the association - let assoc_table = (0..left_cap_vec.len()).map( - |id| graph[id+2].iter() - .filter(|e| e.flow > 0) - .map( |e| e.v-2-left_cap_vec.len()) - .collect()).collect(); - - //consistency check - - //it is a flow - for i in 3..graph.len(){ - assert!( graph[i].iter().map(|e| e.flow).sum::() == 0); - for e in graph[i].iter(){ - assert!(e.flow + graph[e.v][e.rev].flow == 0); - } - } - - //it solves the matching problem - for i in 0..left_cap_vec.len(){ - assert!(left_cap_vec[i] as i32 == - graph[i+2].iter().map(|e| max(0,e.flow)).sum::()); - } - for i in 0..right_cap_vec.len(){ - assert!(right_cap_vec[i] as i32 == - graph[i+2+left_cap_vec.len()].iter() - .map(|e| max(0,e.flow)).sum::()); - } - - - assoc_table -} +pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) -> Vec> { + let mut graph = Vec::>::new(); + let ed = EdgeFlow { + c: 0, + flow: 0, + v: 0, + rev: 0, + }; + + // 0 will be the source + graph.push(vec![ed; left_cap_vec.len()]); + for i in 0..left_cap_vec.len() { + graph[0][i].c = left_cap_vec[i] as i32; + graph[0][i].v = i + 2; + graph[0][i].rev = 0; + } + + //1 will be the sink + graph.push(vec![ed; right_cap_vec.len()]); + for i in 0..right_cap_vec.len() { + graph[1][i].c = right_cap_vec[i] as i32; + graph[1][i].v = i + 2 + left_cap_vec.len(); + graph[1][i].rev = 0; + } + + //we add left vertices + for i in 0..left_cap_vec.len() { + graph.push(vec![ed; 1 + right_cap_vec.len()]); + graph[i + 2][0].c = 0; //directed + graph[i + 2][0].v = 0; + graph[i + 2][0].rev = i; + + for j in 0..right_cap_vec.len() { + graph[i + 2][j + 1].c = 1; + graph[i + 2][j + 1].v = 2 + left_cap_vec.len() + j; + graph[i + 2][j + 1].rev = i + 1; + } + } + //we add right vertices + for i in 0..right_cap_vec.len() { + let lft_ln = left_cap_vec.len(); + graph.push(vec![ed; 1 + lft_ln]); + graph[i + lft_ln + 2][0].c = graph[1][i].c; + graph[i + lft_ln + 2][0].v = 1; + graph[i + lft_ln + 2][0].rev = i; + + for j in 0..left_cap_vec.len() { + graph[i + 2 + lft_ln][j + 1].c = 0; //directed + graph[i + 2 + lft_ln][j + 1].v = j + 2; + graph[i + 2 + lft_ln][j + 1].rev = i + 1; + } + } + + //To ensure the dispersion of the triplets generated by the + //assignation, we shuffle the neighbours of the nodes. Hence, + //left vertices do not consider the right ones in the same order. + let mut rng = rand::thread_rng(); + for i in 0..graph.len() { + graph[i].shuffle(&mut rng); + //We need to update the ids of the reverse edges. + for j in 0..graph[i].len() { + let target_v = graph[i][j].v; + let target_rev = graph[i][j].rev; + graph[target_v][target_rev].rev = j; + } + } + + let nb_vertices = graph.len(); + + //We run Dinic's max flow algorithm + loop { + //We build the level array from Dinic's algorithm. 
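+		//(level[v] will hold the BFS distance from the source along edges
+		//with positive residual capacity; the DFS below only follows edges
+		//that go to a strictly higher level, as in Dinic's algorithm.)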
+		let mut level = vec![-1; nb_vertices];
+
+		let mut fifo = VecDeque::new();
+		fifo.push_back((0, 0));
+		while !fifo.is_empty() {
+			if let Some((id, lvl)) = fifo.pop_front() {
+				if level[id] == -1 {
+					level[id] = lvl;
+					for e in graph[id].iter() {
+						if e.c - e.flow > 0 {
+							fifo.push_back((e.v, lvl + 1));
+						}
+					}
+				}
+			}
+		}
+		if level[1] == -1 {
+			//There is no residual flow
+			break;
+		}
+
+		//Now we run DFS respecting the level array
+		let mut next_nbd = vec![0; nb_vertices];
+		let mut lifo = VecDeque::new();
+
+		let flow_upper_bound;
+		if let Some(x) = left_cap_vec.iter().max() {
+			flow_upper_bound = *x as i32;
+		} else {
+			flow_upper_bound = 0;
+			assert!(false);
+		}
+
+		lifo.push_back((0, flow_upper_bound));
+
+		loop {
+			if let Some((id_tmp, f_tmp)) = lifo.back() {
+				let id = *id_tmp;
+				let f = *f_tmp;
+				if id == 1 {
+					//The DFS reached the sink, we can add a
+					//residual flow.
+					lifo.pop_back();
+					while !lifo.is_empty() {
+						if let Some((id, _)) = lifo.pop_back() {
+							let nbd = next_nbd[id];
+							graph[id][nbd].flow += f;
+							let id_v = graph[id][nbd].v;
+							let nbd_v = graph[id][nbd].rev;
+							graph[id_v][nbd_v].flow -= f;
+						}
+					}
+					lifo.push_back((0, flow_upper_bound));
+					continue;
+				}
+				//else we did not reach the sink
+				let nbd = next_nbd[id];
+				if nbd >= graph[id].len() {
+					//There is nothing to explore from id anymore
+					lifo.pop_back();
+					if let Some((parent, _)) = lifo.back() {
+						next_nbd[*parent] += 1;
+					}
+					continue;
+				}
+				//else we can try to send flow from id to its nbd
+				let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow);
+				if level[graph[id][nbd].v] <= level[id] || new_flow == 0 {
+					//We cannot send flow to nbd.
+					next_nbd[id] += 1;
+					continue;
+				}
+				//otherwise, we send flow to nbd.
+				lifo.push_back((graph[id][nbd].v, new_flow));
+			} else {
+				break;
+			}
+		}
+	}
+
+	//We return the association
+	let assoc_table = (0..left_cap_vec.len())
+		.map(|id| {
+			graph[id + 2]
+				.iter()
+				.filter(|e| e.flow > 0)
+				.map(|e| e.v - 2 - left_cap_vec.len())
+				.collect()
+		})
+		.collect();
+
+	//consistency check
+
+	//it is a flow
+	for i in 3..graph.len() {
+		assert!(graph[i].iter().map(|e| e.flow).sum::<i32>() == 0);
+		for e in graph[i].iter() {
+			assert!(e.flow + graph[e.v][e.rev].flow == 0);
+		}
+	}
+
+	//it solves the matching problem
+	for i in 0..left_cap_vec.len() {
+		assert!(left_cap_vec[i] as i32 == graph[i + 2].iter().map(|e| max(0, e.flow)).sum::<i32>());
+	}
+	for i in 0..right_cap_vec.len() {
+		assert!(
+			right_cap_vec[i] as i32
+				== graph[i + 2 + left_cap_vec.len()]
+					.iter()
+					.map(|e| max(0, e.flow))
+					.sum::<i32>()
+		);
+	}
+
+	assoc_table
+}
 
 #[cfg(test)]
 mod tests {
-	use super::*;
-
-	#[test]
-	fn test_flow() {
-		let left_vec = vec![3;8];
-		let right_vec = vec![0,4,8,4,8];
-		//There are asserts in the function that computes the flow
-		let _ = dinic_compute_matching(left_vec, right_vec);
-	}
-
-	//maybe add tests relative to the matching optilization ?
-}
+	use super::*;
+	#[test]
+	fn test_flow() {
+		let left_vec = vec![3; 8];
+		let right_vec = vec![0, 4, 8, 4, 8];
+		//There are asserts in the function that computes the flow
+		let _ = dinic_compute_matching(left_vec, right_vec);
+	}
+	//maybe add tests relative to the matching optimization ?
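+
+	//A minimal sketch of such a test. The only property asserted here is
+	//taken from optimize_matching's stated contract (same degrees as
+	//new_match at every vertex); the input values are made up for
+	//illustration.
+	#[test]
+	fn test_optimize_matching_degrees() {
+		let old_match = vec![vec![0], vec![1], vec![2]];
+		let new_match = vec![vec![1], vec![2], vec![0]];
+		let opt = optimize_matching(&old_match, &new_match, 3);
+		for i in 0..3 {
+			assert_eq!(opt[i].len(), new_match[i].len());
+		}
+	}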
+} -- cgit v1.2.3 From 948ff93cf10da1705766c2f0d256c316adcb806b Mon Sep 17 00:00:00 2001 From: Mendes Date: Sun, 1 May 2022 16:05:39 +0200 Subject: Corrected the warnings and errors issued by cargo clippy --- src/rpc/layout.rs | 26 +++++------ src/util/bipartite.rs | 117 ++++++++++++++++++++++---------------------------- 2 files changed, 63 insertions(+), 80 deletions(-) (limited to 'src') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index ac31da72..d0ee3463 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -195,8 +195,8 @@ impl ClusterLayout { .collect::>(); //We create an indexing of the zones let mut zone_id = HashMap::::new(); - for i in 0..part_per_zone_vec.len() { - zone_id.insert(part_per_zone_vec[i].0.clone(), i); + for (i, ppz) in part_per_zone_vec.iter().enumerate() { + zone_id.insert(ppz.0.clone(), i); } //We compute a candidate for the new partition to zone @@ -212,7 +212,7 @@ impl ClusterLayout { let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; //We will decrement part_per_nod to keep track of the number //of partitions that we still have to associate. - let mut part_per_nod = part_per_nod.clone(); + let mut part_per_nod = part_per_nod; //We minimize the distance to the former assignation(if any) @@ -265,7 +265,7 @@ impl ClusterLayout { && part_per_nod[*id] > 0 }) .collect(); - assert!(possible_nodes.len() > 0); + assert!(!possible_nodes.is_empty()); //We randomly pick a node if let Some(nod) = possible_nodes.choose(&mut rng) { node_assignation[i][j] = Some(*nod); @@ -277,12 +277,12 @@ impl ClusterLayout { //We write the assignation in the 1D table self.ring_assignation_data = Vec::::new(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if let Some(id) = node_assignation[i][j] { + for ass in node_assignation { + for nod in ass { + if let Some(id) = nod { self.ring_assignation_data.push(id as CompactNodeType); } else { - assert!(false) + panic!() } } } @@ -318,7 +318,7 @@ impl ClusterLayout { self.node_id_vec = new_node_id_vec; self.ring_assignation_data = vec![]; - return node_assignation; + node_assignation } ///This function compute the number of partition to assign to @@ -345,7 +345,7 @@ impl ClusterLayout { //Compute the optimal number of partitions per zone let sum_capacities: u32 = zone_capacity.values().sum(); - if sum_capacities <= 0 { + if sum_capacities == 0 { println!("No storage capacity in the network."); return None; } @@ -493,14 +493,10 @@ impl ClusterLayout { .map(|id_nod| match self.node_role(id_nod) { Some(NodeRole { zone: _, - capacity, + capacity: Some(c), tags: _, }) => { - if let Some(c) = capacity { *c - } else { - 0 - } } _ => 0, }) diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs index ade831a4..1e1e9caa 100644 --- a/src/util/bipartite.rs +++ b/src/util/bipartite.rs @@ -31,8 +31,8 @@ struct WeightedEdge { * as possible to old_match. * */ pub fn optimize_matching( - old_match: &Vec>, - new_match: &Vec>, + old_match: &[Vec], + new_match: &[Vec], nb_right: usize, ) -> Vec> { let nb_left = old_match.len(); @@ -72,16 +72,11 @@ pub fn optimize_matching( //Discovering and flipping a cycle with positive weight in this //graph will make the matching closer to old_match. //We use Bellman Ford algorithm to discover positive cycles - loop { - if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { - for i in cycle { - //We flip the edges of the cycle. 
- (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); - edge_vec[i].w *= -1; - } - } else { - //If there is no cycle, we return the optimal matching. - break; + while let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { + for i in cycle { + //We flip the edges of the cycle. + (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); + edge_vec[i].w *= -1; } } @@ -97,7 +92,7 @@ pub fn optimize_matching( //This function finds a positive cycle in a bipartite wieghted graph. fn positive_cycle( - edge_vec: &Vec, + edge_vec: &[WeightedEdge], nb_left: usize, nb_right: usize, ) -> Option> { @@ -122,8 +117,7 @@ fn positive_cycle( //the number of partitions can be much larger than the //number of nodes, we optimize that. for _ in 0..(2 * nb_side_min) { - for j in 0..edge_vec.len() { - let e = edge_vec[j]; + for (j, e) in edge_vec.iter().enumerate() { if weight[e.v] < weight[e.u] + e.w { weight[e.v] = weight[e.u] + e.w; prev[e.v] = j; @@ -148,8 +142,7 @@ fn positive_cycle( curr = edge_vec[prev[curr]].u; } //Now curr is on the cycle. We collect the edges ids. - let mut cycle = Vec::::new(); - cycle.push(prev[curr]); + let mut cycle = vec![prev[curr]]; let mut cycle_vert = edge_vec[prev[curr]].u; while cycle_vert != curr { cycle.push(prev[cycle_vert]); @@ -180,16 +173,16 @@ pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) - // 0 will be the source graph.push(vec![ed; left_cap_vec.len()]); - for i in 0..left_cap_vec.len() { - graph[0][i].c = left_cap_vec[i] as i32; + for (i, c) in left_cap_vec.iter().enumerate() { + graph[0][i].c = *c as i32; graph[0][i].v = i + 2; graph[0][i].rev = 0; } //1 will be the sink graph.push(vec![ed; right_cap_vec.len()]); - for i in 0..right_cap_vec.len() { - graph[1][i].c = right_cap_vec[i] as i32; + for (i, c) in right_cap_vec.iter().enumerate() { + graph[1][i].c = *c as i32; graph[1][i].v = i + 2 + left_cap_vec.len(); graph[1][i].rev = 0; } @@ -267,58 +260,52 @@ pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) - let mut next_nbd = vec![0; nb_vertices]; let mut lifo = VecDeque::new(); - let flow_upper_bound; - if let Some(x) = left_cap_vec.iter().max() { - flow_upper_bound = *x as i32; + let flow_upper_bound = if let Some(x) = left_cap_vec.iter().max() { + *x as i32 } else { - flow_upper_bound = 0; - assert!(false); - } + panic!(); + }; lifo.push_back((0, flow_upper_bound)); - loop { - if let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == 1 { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - graph[id][nbd].flow += f; - let id_v = graph[id][nbd].v; - let nbd_v = graph[id][nbd].rev; - graph[id_v][nbd_v].flow -= f; - } + while let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == 1 { + //The DFS reached the sink, we can add a + //residual flow. 
+ lifo.pop_back(); + while !lifo.is_empty() { + if let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + graph[id][nbd].flow += f; + let id_v = graph[id][nbd].v; + let nbd_v = graph[id][nbd].rev; + graph[id_v][nbd_v].flow -= f; } - lifo.push_back((0, flow_upper_bound)); - continue; } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { - next_nbd[*parent] += 1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow); - if level[graph[id][nbd].v] <= level[id] || new_flow == 0 { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; + lifo.push_back((0, flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back() { + next_nbd[*parent] += 1; } - //otherwise, we send flow to nbd. - lifo.push_back((graph[id][nbd].v, new_flow)); - } else { - break; + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow); + if level[graph[id][nbd].v] <= level[id] || new_flow == 0 { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; } + //otherwise, we send flow to nbd. + lifo.push_back((graph[id][nbd].v, new_flow)); } } -- cgit v1.2.3 From 617f28bfa466d52fac7244f08b3a036ab4e8c9af Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 5 May 2022 14:21:57 +0200 Subject: Correct small formatting issue --- src/rpc/layout.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'src') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index d0ee3463..40f97368 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -495,9 +495,7 @@ impl ClusterLayout { zone: _, capacity: Some(c), tags: _, - }) => { - *c - } + }) => *c, _ => 0, }) .collect(); -- cgit v1.2.3 From 7f3249a23770fd4da981c2ecb1126da97e9b4ca5 Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 21 Sep 2022 14:39:59 +0200 Subject: New version of the algorithm that calculate the layout. It takes as paramters the replication factor and the zone redundancy, computes the largest partition size reachable with these constraints, and among the possible assignation with this partition size, it computes the one that moves the least number of partitions compared to the previous assignation. This computation uses graph algorithms defined in graph_algo.rs --- src/rpc/graph_algo.rs | 440 ++++++++++++++++++++++++++++ src/rpc/layout.rs | 795 ++++++++++++++++++++++++++++++-------------------- src/rpc/lib.rs | 2 + src/rpc/ring.rs | 1 + src/rpc/system.rs | 5 +- src/util/bipartite.rs | 363 ----------------------- src/util/lib.rs | 1 - 7 files changed, 918 insertions(+), 689 deletions(-) create mode 100644 src/rpc/graph_algo.rs delete mode 100644 src/util/bipartite.rs (limited to 'src') diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs new file mode 100644 index 00000000..1a809b80 --- /dev/null +++ b/src/rpc/graph_algo.rs @@ -0,0 +1,440 @@ + +//! This module deals with graph algorithms. +//! It is used in layout.rs to build the partition to node assignation. + +use rand::prelude::SliceRandom; +use std::cmp::{max, min}; +use std::collections::VecDeque; +use std::collections::HashMap; + +//Vertex data structures used in all the graphs used in layout.rs. 
+//usize parameters correspond to node/zone/partitions ids.
+//To understand the vertex roles below, please refer to the formal description
+//of the layout computation algorithm.
+#[derive(Clone,Copy,Debug, PartialEq, Eq, Hash)]
+pub enum Vertex{
+	Source,
+	Pup(usize),	//The vertex p+ of partition p
+	Pdown(usize),	//The vertex p- of partition p
+	PZ(usize,usize),	//The vertex corresponding to x_(partition p, zone z)
+	N(usize),	//The vertex corresponding to node n
+	Sink
+}
+
+
+//Edge data structure for the flow algorithm.
+//The graph is stored as an adjacency list
+#[derive(Clone, Copy, Debug)]
+pub struct FlowEdge {
+	cap: u32,    //flow maximal capacity of the edge
+	flow: i32,   //flow value on the edge
+	dest: usize, //destination vertex id
+	rev: usize,  //index of the reversed edge (v, self) in the edge list of vertex v
+}
+
+//Edge data structure for the detection of negative cycles.
+//The graph is stored as a list of edges (u,v).
+#[derive(Clone, Copy, Debug)]
+pub struct WeightedEdge {
+	w: i32, //weight of the edge
+	dest: usize,
+}
+
+pub trait Edge: Clone + Copy {}
+impl Edge for FlowEdge {}
+impl Edge for WeightedEdge {}
+
+//Struct for the graph structure. We do encapsulation here to be able to both
+//provide user friendly Vertex enum to address vertices, and to use usize indices
+//and Vec instead of HashMap in the graph algorithm to optimize execution speed.
+pub struct Graph<E : Edge>{
+	vertextoid : HashMap<Vertex, usize>,
+	idtovertex : Vec<Vertex>,
+
+	graph : Vec< Vec<E> >
+}
+
+pub type CostFunction = HashMap<(Vertex,Vertex), i32>;
+
+impl<E : Edge> Graph<E>{
+	pub fn new(vertices : &[Vertex]) -> Self {
+		let mut map = HashMap::<Vertex, usize>::new();
+		for i in 0..vertices.len() {
+			map.insert(vertices[i] , i);
+		}
+		return Graph::<E> {
+			vertextoid : map,
+			idtovertex: vertices.to_vec(),
+			graph : vec![Vec::< E >::new(); vertices.len() ]
+		}
+	}
+}
+
+impl Graph<FlowEdge>{
+	//This function adds a directed edge to the graph with capacity c, and the
+	//corresponding reversed edge with capacity 0.
+	pub fn add_edge(&mut self, u: Vertex, v:Vertex, c: u32) -> Result<(), String>{
+		if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) {
+			return Err("The graph does not contain the provided vertex.".to_string());
+		}
+		let idu = self.vertextoid[&u];
+		let idv = self.vertextoid[&v];
+		let rev_u = self.graph[idu].len();
+		let rev_v = self.graph[idv].len();
+		self.graph[idu].push( FlowEdge{cap: c , dest: idv , flow: 0, rev : rev_v} );
+		self.graph[idv].push( FlowEdge{cap: 0 , dest: idu , flow: 0, rev : rev_u} );
+		Ok(())
+	}
+
+	//This function returns the list of vertices that receive a positive flow from
+	//vertex v.
+	pub fn get_positive_flow_from(&self , v:Vertex) -> Result< Vec<Vertex> , String>{
+		if !self.vertextoid.contains_key(&v) {
+			return Err("The graph does not contain the provided vertex.".to_string());
+		}
+		let idv = self.vertextoid[&v];
+		let mut result = Vec::<Vertex>::new();
+		for edge in self.graph[idv].iter() {
+			if edge.flow > 0 {
+				result.push(self.idtovertex[edge.dest]);
+			}
+		}
+		return Ok(result);
+	}
+
+
+	//This function returns the value of the flow incoming to v.
+	pub fn get_inflow(&self , v:Vertex) -> Result< i32 , String>{
+		if !self.vertextoid.contains_key(&v) {
+			return Err("The graph does not contain the provided vertex.".to_string());
+		}
+		let idv = self.vertextoid[&v];
+		let mut result = 0;
+		for edge in self.graph[idv].iter() {
+			result += max(0,self.graph[edge.dest][edge.rev].flow);
+		}
+		return Ok(result);
+	}
+
+	//This function returns the value of the flow outgoing from v.
+ pub fn get_outflow(&self , v:Vertex) -> Result< i32 , String>{ + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0,edge.flow); + } + return Ok(result); + } + + //This function computes the flow total value by computing the outgoing flow + //from the source. + pub fn get_flow_value(&mut self) -> Result { + return self.get_outflow(Vertex::Source); + } + + //This function shuffles the order of the edge lists. It keeps the ids of the + //reversed edges consistent. + fn shuffle_edges(&mut self) { + let mut rng = rand::thread_rng(); + for i in 0..self.graph.len() { + self.graph[i].shuffle(&mut rng); + //We need to update the ids of the reverse edges. + for j in 0..self.graph[i].len() { + let target_v = self.graph[i][j].dest; + let target_rev = self.graph[i][j].rev; + self.graph[target_v][target_rev].rev = j; + } + } + } + + //Computes an upper bound of the flow n the graph + pub fn flow_upper_bound(&self) -> u32{ + let idsource = self.vertextoid[&Vertex::Source]; + let mut flow_upper_bound = 0; + for edge in self.graph[idsource].iter(){ + flow_upper_bound += edge.cap; + } + return flow_upper_bound; + } + + //This function computes the maximal flow using Dinic's algorithm. It starts with + //the flow values already present in the graph. So it is possible to add some edge to + //the graph, compute a flow, add other edges, update the flow. + pub fn compute_maximal_flow(&mut self) -> Result<(), String> { + if !self.vertextoid.contains_key(&Vertex::Source) { + return Err("The graph does not contain a source.".to_string()); + } + if !self.vertextoid.contains_key(&Vertex::Sink) { + return Err("The graph does not contain a sink.".to_string()); + } + + let idsource = self.vertextoid[&Vertex::Source]; + let idsink = self.vertextoid[&Vertex::Sink]; + + let nb_vertices = self.graph.len(); + + let flow_upper_bound = self.flow_upper_bound(); + + //To ensure the dispersion of the associations generated by the + //assignation, we shuffle the neighbours of the nodes. Hence, + //the vertices do not consider their neighbours in the same order. + self.shuffle_edges(); + + //We run Dinic's max flow algorithm + loop { + //We build the level array from Dinic's algorithm. + let mut level = vec![None; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((idsource, 0)); + while !fifo.is_empty() { + if let Some((id, lvl)) = fifo.pop_front() { + if level[id] == None { //it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i32 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); + } + } + } + } + } + if level[idsink] == None { + //There is no residual flow + break; + } + + //Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = VecDeque::new(); + + lifo.push_back((idsource, flow_upper_bound)); + + while let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == idsink { + //The DFS reached the sink, we can add a + //residual flow. 
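+					//(We unwind the whole DFS stack, adding f along every
+					//edge of the source-to-sink path it represents and
+					//subtracting it from the corresponding reverse edges.)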
+					lifo.pop_back();
+					while !lifo.is_empty() {
+						if let Some((id, _)) = lifo.pop_back() {
+							let nbd = next_nbd[id];
+							self.graph[id][nbd].flow += f as i32;
+							let id_rev = self.graph[id][nbd].dest;
+							let nbd_rev = self.graph[id][nbd].rev;
+							self.graph[id_rev][nbd_rev].flow -= f as i32;
+						}
+					}
+					lifo.push_back((idsource, flow_upper_bound));
+					continue;
+				}
+				//else we did not reach the sink
+				let nbd = next_nbd[id];
+				if nbd >= self.graph[id].len() {
+					//There is nothing to explore from id anymore
+					lifo.pop_back();
+					if let Some((parent, _)) = lifo.back() {
+						next_nbd[*parent] += 1;
+					}
+					continue;
+				}
+				//else we can try to send flow from id to its nbd
+				let new_flow = min(f, self.graph[id][nbd].cap - self.graph[id][nbd].flow as u32 );
+				if let (Some(lvldest), Some(lvlid)) =
+					(level[self.graph[id][nbd].dest], level[id]){
+					if lvldest <= lvlid || new_flow == 0 {
+						//We cannot send flow to nbd.
+						next_nbd[id] += 1;
+						continue;
+					}
+				}
+				//otherwise, we send flow to nbd.
+				lifo.push_back((self.graph[id][nbd].dest, new_flow));
+			}
+		}
+		Ok(())
+	}
+
+	//This function takes a flow, and a cost function on the edges, and tries to find an
+	// equivalent flow with a better cost, by finding improving overflow cycles. It uses
+	// as subroutine the Bellman Ford algorithm run up to path_length.
+	// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only
+	// one needs to be present in the cost function.
+	pub fn optimize_flow_with_cost(&mut self , cost: &CostFunction, path_length: usize )
+		-> Result<(),String>{
+
+		//We build the weighted graph g where we will look for negative cycle
+		let mut gf = self.build_cost_graph(cost)?;
+		let mut cycles = gf.list_negative_cycles(path_length);
+		while cycles.len() > 0 {
+			//we enumerate negative cycles
+			for c in cycles.iter(){
+				for i in 0..c.len(){
+					//We add one flow unit to the edge (u,v) of cycle c
+					let idu = self.vertextoid[&c[i]];
+					let idv = self.vertextoid[&c[(i+1)%c.len()]];
+					for j in 0..self.graph[idu].len(){
+						//since idu appears at most once in the cycles, we enumerate every
+						//edge at most once.
+						let edge = self.graph[idu][j];
+						if edge.dest == idv {
+							self.graph[idu][j].flow += 1;
+							self.graph[idv][edge.rev].flow -=1;
+							break;
+						}
+					}
+				}
+			}
+
+			gf = self.build_cost_graph(cost)?;
+			cycles = gf.list_negative_cycles(path_length);
+		}
+		return Ok(());
+	}
+
+	//Construct the weighted graph G_f from the flow and the cost function
+	fn build_cost_graph(&self , cost: &CostFunction) -> Result<Graph<WeightedEdge>,String>{
+
+		let mut g = Graph::<WeightedEdge>::new(&self.idtovertex);
+		let nb_vertices = self.idtovertex.len();
+		for i in 0..nb_vertices {
+			for edge in self.graph[i].iter() {
+				if edge.cap as i32 -edge.flow > 0 {
+					//It is possible to send overflow through this edge
+					let u = self.idtovertex[i];
+					let v = self.idtovertex[edge.dest];
+					if cost.contains_key(&(u,v)) {
+						g.add_edge(u,v, cost[&(u,v)])?;
+					}
+					else if cost.contains_key(&(v,u)) {
+						g.add_edge(u,v, -cost[&(v,u)])?;
+					}
+					else{
+						g.add_edge(u,v, 0)?;
+					}
+				}
+			}
+		}
+		return Ok(g);
+
+	}
+
+
+}
+
+impl Graph<WeightedEdge>{
+	//This function adds a single directed weighted edge to the graph.
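+	//(Unlike add_edge on the flow graph above, no reverse edge is created.)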
+	pub fn add_edge(&mut self, u: Vertex, v:Vertex, w: i32) -> Result<(), String>{
+		if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) {
+			return Err("The graph does not contain the provided vertex.".to_string());
+		}
+		let idu = self.vertextoid[&u];
+		let idv = self.vertextoid[&v];
+		self.graph[idu].push( WeightedEdge{w: w , dest: idv} );
+		Ok(())
+	}
+
+	//This function lists the negative cycles it manages to find after path_length
+	//iterations of the main loop of the Bellman-Ford algorithm. For the classical
+	//algorithm, path_length needs to be equal to the number of vertices. However,
+	//for particular graph structures like our case, the algorithm is still correct
+	//when path_length is the length of the longest possible simple path.
+	//See the formal description of the algorithm for more details.
+	fn list_negative_cycles(&self, path_length: usize) -> Vec< Vec<Vertex> > {
+
+		let nb_vertices = self.graph.len();
+
+		//We start with every vertex at distance 0 from an imaginary extra source vertex.
+		let mut distance = vec![0 ; nb_vertices];
+		//The prev vector records, for every vertex, where its shortest path comes from
+		let mut prev = vec![None; nb_vertices];
+
+		for _ in 0..path_length +1 {
+			for id in 0..nb_vertices{
+				for e in self.graph[id].iter(){
+					if distance[id] + e.w < distance[e.dest] {
+						distance[e.dest] = distance[id] + e.w;
+						prev[e.dest] = Some(id);
+					}
+				}
+			}
+		}
+
+		//If self.graph contains a negative cycle, then at this point the graph described
+		//by prev (which is a directed 1-forest/functional graph)
+		//must contain a cycle. We list the cycles of prev.
+		let cycles_prev = cycles_of_1_forest(&prev);
+
+		//Remark that the cycle in prev is in the reverse order compared to the cycle
+		//in the graph. Thus the .rev().
+		return cycles_prev.iter().map(|cycle| cycle.iter().rev().map(
+				|id| self.idtovertex[*id]
+				).collect() ).collect();
+	}
+
+}
+
+
+//This function returns the list of cycles of a directed 1 forest. It does not
+//check for the consistency of the input.
+fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> {
+	let mut cycles = Vec::<Vec<usize>>::new();
+	let mut time_of_discovery = vec![None; forest.len()];
+
+	for t in 0..forest.len(){
+		let mut id = t;
+		//while we are on a valid undiscovered node
+		while time_of_discovery[id] == None {
+			time_of_discovery[id] = Some(t);
+			if let Some(i) = forest[id] {
+				id = i;
+			}
+			else{
+				break;
+			}
+		}
+		if forest[id] != None && time_of_discovery[id] == Some(t) {
+			//We discovered an id that we explored at this iteration t.
+ //It means we are on a cycle + let mut cy = vec![id; 1]; + let id2 = id; + while let Some(id2) = forest[id2] { + if id2 != id { + cy.push(id2); + } + else { + break; + } + } + cycles.push(cy); + } + } + return cycles; +} + + +//==================================================================================== +//==================================================================================== +//==================================================================================== +//==================================================================================== +//==================================================================================== +//==================================================================================== + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_flow() { + let left_vec = vec![3; 8]; + let right_vec = vec![0, 4, 8, 4, 8]; + //There are asserts in the function that computes the flow + } + + //maybe add tests relative to the matching optilization ? +} diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 40f97368..ff60ce98 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,17 +1,23 @@ -use std::cmp::min; use std::cmp::Ordering; use std::collections::HashMap; +use std::collections::HashSet; + +use hex::ToHex; use serde::{Deserialize, Serialize}; -use garage_util::bipartite::*; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; -use rand::prelude::SliceRandom; +use crate::graph_algo::*; use crate::ring::*; +use std::convert::TryInto; + +//The Message type will be used to collect information on the algorithm. +type Message = Vec; + /// The layout of the cluster, i.e. the list of roles /// which are assigned to each cluster node #[derive(Clone, Debug, Serialize, Deserialize)] @@ -19,12 +25,21 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, + #[serde(default="default_one")] + pub zone_redundancy: usize, + + //This attribute is only used to retain the previously computed partition size, + //to know to what extent does it change with the layout update. + #[serde(default="default_zero")] + pub partition_size: u32, + pub roles: LwwMap, /// node_id_vec: a vector of node IDs with a role assigned /// in the system (this includes gateway nodes). /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers + /// 1. non-gateway nodes are first so that they have lower numbers holding + /// in u8 (the number of non-gateway nodes is at most 256). /// 2. 
nodes that don't have a role are excluded (but they need to /// stay in the CRDT as tombstones) pub node_id_vec: Vec, @@ -38,6 +53,15 @@ pub struct ClusterLayout { pub staging_hash: Hash, } +fn default_one() -> usize{ + return 1; +} +fn default_zero() -> u32{ + return 0; +} + +const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -66,16 +90,31 @@ impl NodeRole { None => "gateway".to_string(), } } + + pub fn tags_string(&self) -> String { + let mut tags = String::new(); + if self.tags.len() == 0 { + return tags + } + tags.push_str(&self.tags[0].clone()); + for t in 1..self.tags.len(){ + tags.push_str(","); + tags.push_str(&self.tags[t].clone()); + } + return tags; + } } impl ClusterLayout { - pub fn new(replication_factor: usize) -> Self { + pub fn new(replication_factor: usize, zone_redundancy: usize) -> Self { let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); ClusterLayout { version: 0, replication_factor, + zone_redundancy, + partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), @@ -122,6 +161,44 @@ impl ClusterLayout { } } + ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub fn useful_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity != None => result.push(*uuid), + _ => () + } + } + return result; + } + + ///Given a node uuids, this function returns the label of its zone + pub fn get_node_zone(&self, uuid : &Uuid) -> Result { + match self.node_role(uuid) { + Some(role) => return Ok(role.zone.clone()), + _ => return Err("The Uuid does not correspond to a node present in the cluster.".to_string()) + } + } + + ///Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), + _ => return Err("The Uuid does not correspond to a node present in the cluster or this node does not have a positive capacity.".to_string()) + } + } + + ///Returns the sum of capacities of non gateway nodes in the cluster + pub fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.useful_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + return Ok(total_capacity); + } + + /// Check a cluster layout for internal consistency /// returns true if consistent, false if error pub fn check(&self) -> bool { @@ -168,342 +245,412 @@ impl ClusterLayout { true } +} + +impl ClusterLayout { /// This function calculates a new partition-to-node assignation. - /// The computed assignation maximizes the capacity of a + /// The computed assignation respects the node replication factor + /// and the zone redundancy parameter It maximizes the capacity of a /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of - /// data to be moved. A heuristic ensures node triplets - /// dispersion (in garage_util::bipartite::optimize_matching()). - pub fn calculate_partition_assignation(&mut self) -> bool { + /// data to be moved. 
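+	/// In broad strokes (a summary of the helper functions below): node ids
+	/// are updated, zones are given numerical ids, the optimal partition
+	/// size is computed with a flow algorithm, a candidate assignation is
+	/// derived from a maximal flow, and the number of moved partitions is
+	/// then minimized by cancelling negative cycles in a cost graph.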
+ pub fn calculate_partition_assignation(&mut self, replication:usize, redundancy:usize) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. - let old_node_assignation = self.update_nodes_and_ring(); - - let (node_zone, _) = self.get_node_zone_capacity(); - - //We compute the optimal number of partition to assign to - //every node and zone. - if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions() { - //We collect part_per_zone in a vec to not rely on the - //arbitrary order in which elements are iterated in - //Hashmap::iter() - let part_per_zone_vec = part_per_zone - .iter() - .map(|(x, y)| (x.clone(), *y)) - .collect::>(); - //We create an indexing of the zones - let mut zone_id = HashMap::::new(); - for (i, ppz) in part_per_zone_vec.iter().enumerate() { - zone_id.insert(ppz.0.clone(), i); - } - - //We compute a candidate for the new partition to zone - //assignation. - let nb_zones = part_per_zone.len(); - let nb_nodes = part_per_nod.len(); - let nb_partitions = 1 << PARTITION_BITS; - let left_cap_vec = vec![self.replication_factor as u32; nb_partitions]; - let right_cap_vec = part_per_zone_vec.iter().map(|(_, y)| *y as u32).collect(); - let mut zone_assignation = dinic_compute_matching(left_cap_vec, right_cap_vec); - - //We create the structure for the partition-to-node assignation. - let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; - //We will decrement part_per_nod to keep track of the number - //of partitions that we still have to associate. - let mut part_per_nod = part_per_nod; - - //We minimize the distance to the former assignation(if any) - - //We get the id of the zones of the former assignation - //(and the id no_zone if there is no node assignated) - let no_zone = part_per_zone_vec.len(); - let old_zone_assignation: Vec> = old_node_assignation - .iter() - .map(|x| { - x.iter() - .map(|id| match *id { - Some(i) => zone_id[&node_zone[i]], - None => no_zone, - }) - .collect() - }) - .collect(); - - //We minimize the distance to the former zone assignation - zone_assignation = - optimize_matching(&old_zone_assignation, &zone_assignation, nb_zones + 1); //+1 for no_zone - - //We need to assign partitions to nodes in their zone - //We first put the nodes assignation that can stay the same - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if let Some(Some(former_node)) = old_node_assignation[i].iter().find(|x| { - if let Some(id) = x { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - } else { - false - } - }) { - if part_per_nod[*former_node] > 0 { - node_assignation[i][j] = Some(*former_node); - part_per_nod[*former_node] -= 1; - } - } - } - } - - //We complete the assignation of partitions to nodes - let mut rng = rand::thread_rng(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if node_assignation[i][j] == None { - let possible_nodes: Vec = (0..nb_nodes) - .filter(|id| { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - && part_per_nod[*id] > 0 - }) - .collect(); - assert!(!possible_nodes.is_empty()); - //We randomly pick a node - if let Some(nod) = possible_nodes.choose(&mut rng) { - node_assignation[i][j] = Some(*nod); - part_per_nod[*nod] -= 1; - } - } - } - } - - //We write the assignation in the 1D table - self.ring_assignation_data = Vec::::new(); - for ass in node_assignation { - for nod in ass { - if let Some(id) = nod { - self.ring_assignation_data.push(id 
as CompactNodeType); - } else { - panic!() - } - } - } - - true - } else { - false - } - } + + //We update the node ids, since the node list might have changed with the staged + //changes in the layout. We retrieve the old_assignation reframed with the new ids + let old_assignation_opt = self.update_node_id_vec()?; + self.replication_factor = replication; + self.zone_redundancy = redundancy; + + let mut msg = Message::new(); + msg.push(format!("Computation of a new cluster layout where partitions are + replicated {} times on at least {} distinct zones.", replication, redundancy)); + + //We generate for once numerical ids for the zone, to use them as indices in the + //flow graphs. + let (id_to_zone , zone_to_id) = self.generate_zone_ids()?; + + msg.push(format!("The cluster contains {} nodes spread over {} zones.", + self.useful_nodes().len(), id_to_zone.len())); + + //We compute the optimal partition size + let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + if old_assignation_opt != None { + msg.push(format!("Given the replication and redundancy constraint, the + optimal size of a partition is {}. In the previous layout, it used to + be {}.", partition_size, self.partition_size)); + } + else { + msg.push(format!("Given the replication and redundancy constraints, the + optimal size of a partition is {}.", partition_size)); + } + self.partition_size = partition_size; + + //We compute a first flow/assignment that is heuristically close to the previous + //assignment + let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; + + if let Some(assoc) = &old_assignation_opt { + //We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; + } + + msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); + + //We update the layout structure + self.update_ring_from_flow(id_to_zone.len() , &gflow)?; + return Ok(msg); + } /// The LwwMap of node roles might have changed. This function updates the node_id_vec /// and returns the assignation given by ring, with the new indices of the nodes, and - /// None of the node is not present anymore. + /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. 
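
The doc comment above describes a pure re-indexing step: translate the old ring, which stores old node indices, into the new index space, dropping nodes that no longer exist. In isolation, the idea is the one sketched below (a minimal stand-alone sketch: Uuid is simplified to u64, and remap_assignation is an illustrative name, not the actual Garage API):

    use std::collections::HashMap;

    type Uuid = u64; // stand-in for Garage's 32-byte node identifier

    // Hypothetical helper mirroring the re-indexing idea: for each partition,
    // keep only the nodes that still exist, translated to their new indices.
    fn remap_assignation(
        old_ring: &[u8],            // rf entries per partition, old node indices
        old_node_id_vec: &[Uuid],
        new_node_id_vec: &[Uuid],
        rf: usize,
    ) -> Vec<Vec<usize>> {
        // Translation table: uuid -> new index
        let new_id: HashMap<Uuid, usize> = new_node_id_vec
            .iter()
            .enumerate()
            .map(|(i, u)| (*u, i))
            .collect();
        old_ring
            .chunks(rf)
            .map(|part| {
                part.iter()
                    .filter_map(|old| new_id.get(&old_node_id_vec[*old as usize]).copied())
                    .collect()
            })
            .collect()
    }

Partitions that lost some of their nodes simply end up with fewer than rf entries, which is why the result is a Vec of Vecs rather than a flat ring.
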
- fn update_nodes_and_ring(&mut self) -> Vec>> { - let nb_partitions = 1usize << PARTITION_BITS; - let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; - let rf = self.replication_factor; - let ring = &self.ring_assignation_data; - - let new_node_id_vec: Vec = self.roles.items().iter().map(|(k, _, _)| *k).collect(); - - if ring.len() == rf * nb_partitions { - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - node_assignation[i][j] = new_node_id_vec - .iter() - .position(|id| *id == self.node_id_vec[ring[i * rf + j] as usize]); - } - } - } - - self.node_id_vec = new_node_id_vec; - self.ring_assignation_data = vec![]; - node_assignation + fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,String> { + // (1) We compute the new node list + //Non gateway nodes should be coded on 8bits, hence they must be first in the list + //We build the new node ids + let mut new_non_gateway_nodes: Vec = self.roles.items().iter() + .filter(|(_, _, v)| + match &v.0 {Some(r) if r.capacity != None => true, _=> false }) + .map(|(k, _, _)| *k).collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(format!("There are more than {} non-gateway nodes in the new layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + } + + let mut new_gateway_nodes: Vec = self.roles.items().iter() + .filter(|(_, _, v)| + match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) + .map(|(k, _, _)| *k).collect(); + + let nb_useful_nodes = new_non_gateway_nodes.len(); + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.append(&mut new_non_gateway_nodes); + new_node_id_vec.append(&mut new_gateway_nodes); + + + // (2) We retrieve the old association + //We rewrite the old association with the new indices. We only consider partition + //to node assignations where the node is still in use. + let nb_partitions = 1usize << PARTITION_BITS; + let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; + + if self.ring_assignation_data.len() == 0 { + //This is a new association + return Ok(None); + } + if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + return Err("The old assignation does not have a size corresponding to the old replication factor or the number of partitions.".to_string()); + } + + //We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + //We add the indices of only the new non-gateway nodes that can be used in the + //association ring + for i in 0..nb_useful_nodes { + uuid_to_new_id.insert(new_node_id_vec[i], i ); + } + + let rf= self.replication_factor; + for p in 0..nb_partitions { + for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { + let uuid = self.node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assignation[p].push(uuid_to_new_id[&uuid]); + } + } + } + + //We write the results + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = Vec::::new(); + + return Ok(Some(old_assignation)); } - ///This function compute the number of partition to assign to - ///every node and zone, so that every partition is replicated - ///self.replication_factor times and the capacity of a partition - ///is maximized. 
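
The function removed below computed these proportions roughly as in the following sketch (a simplified stand-alone version; part_per_zone_sketch is an illustrative name, and the real code additionally handles the zero-capacity case and iterates the capping):

    use std::cmp::min;
    use std::collections::HashMap;

    // Proportional share with capping: each zone gets a share of the
    // replication_factor * nb_partitions copies proportional to its capacity,
    // but never more than nb_partitions, since a partition cannot be stored
    // twice in the same zone.
    fn part_per_zone_sketch(
        zone_capacity: &HashMap<String, u32>,
        nb_partitions: usize,
        replication_factor: usize,
    ) -> HashMap<String, usize> {
        let sum: u32 = zone_capacity.values().sum();
        // (the real code first checks that sum > 0)
        zone_capacity
            .iter()
            .map(|(z, c)| {
                let share = replication_factor * nb_partitions * (*c as usize) / (sum as usize);
                (z.clone(), min(nb_partitions, share))
            })
            .collect()
    }
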
- fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { - let mut zone_capacity: HashMap = HashMap::new(); - - let (node_zone, node_capacity) = self.get_node_zone_capacity(); - let nb_nodes = self.node_id_vec.len(); - - for i in 0..nb_nodes { - if zone_capacity.contains_key(&node_zone[i]) { - zone_capacity.insert( - node_zone[i].clone(), - zone_capacity[&node_zone[i]] + node_capacity[i], - ); - } else { - zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); - } - } - - //Compute the optimal number of partitions per zone - let sum_capacities: u32 = zone_capacity.values().sum(); - - if sum_capacities == 0 { - println!("No storage capacity in the network."); - return None; - } - - let nb_partitions = 1 << PARTITION_BITS; - //Initially we would like to use zones porportionally to - //their capacity. - //However, a large zone can be associated to at most - //nb_partitions to ensure replication of the date. - //So we take the min with nb_partitions: - let mut part_per_zone: HashMap = zone_capacity - .iter() - .map(|(k, v)| { - ( - k.clone(), - min( - nb_partitions, - (self.replication_factor * nb_partitions * *v as usize) - / sum_capacities as usize, - ), - ) - }) - .collect(); - - //The replication_factor-1 upper bounds the number of - //part_per_zones that are greater than nb_partitions - for _ in 1..self.replication_factor { - //The number of partitions that are not assignated to - //a zone that takes nb_partitions. - let sum_capleft: u32 = zone_capacity - .keys() - .filter(|k| part_per_zone[*k] < nb_partitions) - .map(|k| zone_capacity[k]) - .sum(); - - //The number of replication of the data that we need - //to ensure. - let repl_left = self.replication_factor - - part_per_zone - .values() - .filter(|x| **x == nb_partitions) - .count(); - if repl_left == 0 { - break; - } - - for k in zone_capacity.keys() { - if part_per_zone[k] != nb_partitions { - part_per_zone.insert( - k.to_string(), - min( - nb_partitions, - (nb_partitions * zone_capacity[k] as usize * repl_left) - / sum_capleft as usize, - ), - ); - } - } - } - - //Now we divide the zone's partition share proportionally - //between their nodes. - - let mut part_per_nod: Vec = (0..nb_nodes) - .map(|i| { - (part_per_zone[&node_zone[i]] * node_capacity[i] as usize) - / zone_capacity[&node_zone[i]] as usize - }) - .collect(); - - //We must update the part_per_zone to make it correspond to - //part_per_nod (because of integer rounding) - part_per_zone = part_per_zone.iter().map(|(k, _)| (k.clone(), 0)).collect(); - for i in 0..nb_nodes { - part_per_zone.insert( - node_zone[i].clone(), - part_per_zone[&node_zone[i]] + part_per_nod[i], - ); - } - - //Because of integer rounding, the total sum of part_per_nod - //might not be replication_factor*nb_partitions. - // We need at most to add 1 to every non maximal value of - // part_per_nod. The capacity of a partition will be bounded - // by the minimal value of - // node_capacity_vec[i]/part_per_nod[i] - // so we try to maximize this minimal value, keeping the - // part_per_zone capped - - let discrepancy: usize = - nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); - - //We use a stupid O(N^2) algorithm. If the number of nodes - //is actually expected to be high, one should optimize this. 
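
The deleted loop below distributes the leftover copies greedily: at each step, one more partition goes to the node whose capacity-to-load ratio after the increment would be largest. In sketch form (pick_node is an illustrative name; the original compared in u32, the sketch widens to u64 to avoid overflow):

    // Greedy criterion: among allowed nodes, maximize capacity / (parts + 1),
    // written as an integer cross-multiplication comparison.
    fn pick_node(capacity: &[u32], parts: &[usize], allowed: impl Fn(usize) -> bool) -> Option<usize> {
        (0..capacity.len())
            .filter(|i| allowed(*i))
            .max_by(|&i, &j| {
                (capacity[i] as u64 * (parts[j] as u64 + 1))
                    .cmp(&(capacity[j] as u64 * (parts[i] as u64 + 1)))
            })
    }
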
- - for _ in 0..discrepancy { - if let Some(idmax) = (0..nb_nodes) - .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) - .max_by(|i, j| { - (node_capacity[*i] * (part_per_nod[*j] + 1) as u32) - .cmp(&(node_capacity[*j] * (part_per_nod[*i] + 1) as u32)) - }) { - part_per_nod[idmax] += 1; - part_per_zone.insert( - node_zone[idmax].clone(), - part_per_zone[&node_zone[idmax]] + 1, - ); - } - } - - //We check the algorithm consistency - - let discrepancy: usize = - nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); - assert!(discrepancy == 0); - assert!(if let Some(v) = part_per_zone.values().max() { - *v <= nb_partitions - } else { - false - }); - - Some((part_per_nod, part_per_zone)) - } - - //Returns vectors of zone and capacity; indexed by the same (temporary) - //indices as node_id_vec. - fn get_node_zone_capacity(&self) -> (Vec, Vec) { - let node_zone = self - .node_id_vec - .iter() - .map(|id_nod| match self.node_role(id_nod) { - Some(NodeRole { - zone, - capacity: _, - tags: _, - }) => zone.clone(), - _ => "".to_string(), - }) - .collect(); - - let node_capacity = self - .node_id_vec - .iter() - .map(|id_nod| match self.node_role(id_nod) { - Some(NodeRole { - zone: _, - capacity: Some(c), - tags: _, - }) => *c, - _ => 0, - }) - .collect(); - - (node_zone, node_capacity) - } + ///This function generates ids for the zone of the nodes appearing in + ///self.node_id_vec. + fn generate_zone_ids(&self) -> Result<(Vec, HashMap),String>{ + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.node_id_vec.iter() { + if self.roles.get(uuid) == None { + return Err("The uuid was not found in the node roles (this should not happen, it might be a critical error).".to_string()); + } + match self.node_role(&uuid) { + Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone() , id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + _ => () + } + } + return Ok((id_to_zone, zone_to_id)); + } + + ///This function computes by dichotomy the largest realizable partition size, given + ///the layout. + fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ + let nb_partitions = 1usize << PARTITION_BITS; + let empty_set = HashSet::<(usize,usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { + return Err("The storage capacity of he cluster is to small. It is impossible to store partitions of size 1.".to_string()); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down +1 < s_up { + g = self.generate_flow_graph((s_down+s_up)/2, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? 
< (nb_partitions*self.replication_factor).try_into().unwrap() { + s_up = (s_down+s_up)/2; + } + else { + s_down = (s_down+s_up)/2; + } + } + + return Ok(s_down); + } + + fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + return vertices; + } + + fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, String> { + let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), + self.useful_nodes().len()); + let mut g= Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), self.zone_redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - self.zone_redundancy) as u32)?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; + g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , + self.replication_factor as u32)?; + } + } + for n in 0..self.useful_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity/size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p,n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + return Ok(g); + } + + + fn compute_candidate_assignment(&self, zone_to_id: &HashMap, + old_assoc_opt : &Option >>) -> Result, String > { + + //We list the edges that are not used in the old association + let mut exclude_edge = HashSet::<(usize,usize)>::new(); + if let Some(old_assoc) = old_assoc_opt { + let nb_nodes = self.useful_nodes().len(); + for p in 0..NB_PARTITIONS { + for n in 0..nb_nodes { + exclude_edge.insert((p,n)); + } + for n in old_assoc[p].iter() { + exclude_edge.remove(&(p,*n)); + } + } + } + + //We compute the best flow using only the edges used in the old assoc + let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge )?; + g.compute_maximal_flow()?; + for (p,n) in exclude_edge.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + return Ok(g); + } + + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), String > { + let mut cost = CostFunction::new(); + for p in 0..NB_PARTITIONS { + for n in old_assoc[p].iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); + } + } + let nb_nodes = self.useful_nodes().len(); + let path_length = 4*nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + return Ok(()); + } + + fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), String>{ + self.ring_assignation_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; + for vertex in assoc_vertex.iter() { + match vertex{ + Vertex::N(n) => self.ring_assignation_data.push((*n).try_into().unwrap()), + _ => () + } + } + } + } + + if self.ring_assignation_data.len() != 
NB_PARTITIONS*self.replication_factor { + return Err("Critical Error : the association ring we produced does not have the right size.".to_string()); + } + return Ok(()); + } + + + //This function returns a message summing up the partition repartition of the new + //layout. + fn output_stat(&self , gflow : &Graph, + old_assoc_opt : &Option< Vec> >, + zone_to_id: &HashMap, + id_to_zone : &Vec) -> Result{ + let mut msg = Message::new(); + + let nb_partitions = 1usize << PARTITION_BITS; + let used_cap = self.partition_size * nb_partitions as u32 * + self.replication_factor as u32; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); + msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + used_cap , total_cap , percent_cap )); + msg.push(format!("If the percentage is to low, it might be that the replication/redundancy constraints force the use of nodes/zones with small storage capacities. + You might want to rebalance the storage capacities or relax the constraints. See the detailed statistics below and look for saturated nodes/zones.")); + msg.push(format!("Recall that because of the replication, the actual available storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); + + //We define and fill in the following tables + let storing_nodes = self.useful_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..nb_partitions { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; + if pz_nodes.len() > 0 { + stored_partitions_zone[z] += 1; + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(old_assoc) = old_assoc_opt { + if !old_assoc[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p.push( + zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + } + + //We display the statistics + + if *old_assoc_opt != None { + let total_new_partitions : usize = new_partitions.iter().sum(); + msg.push(format!("A total of {} new copies of partitions need to be \ + transferred.", total_new_partitions)); + } + msg.push(format!("")); + msg.push(format!("Detailed statistics by zones and nodes.")); + + for z in 0..id_to_zone.len(){ + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len(){ + if self.get_node_zone(&self.node_id_vec[n])? 
== id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions : usize = nodes_of_z.iter() + .map(|n| stored_partitions[*n]).sum(); + msg.push(format!("")); + + if *old_assoc_opt != None { + msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ + {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], + new_partitions_zone[z], replicated_partitions)); + } + else{ + msg.push(format!("Zone {}: {} distinct partitions stored ({} partition \ + copies) ", + id_to_zone[z], stored_partitions_zone[z], replicated_partitions)); + } + + let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); + msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", + available_cap_z, total_cap_z, percent_cap_z)); + msg.push(format!("")); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; + let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self.node_role(&self.node_id_vec[*n]) + .ok_or("Node not found."))?.tags_string(); + msg.push(format!(" Node {}: {} partitions ({} new) ; \ + available/total capacity: {} / {} ({:.1}%) ; tags:{}", + &self.node_id_vec[*n].to_vec().encode_hex::(), + stored_partitions[*n], + new_partitions[*n], available_cap_n, total_cap_n, + (available_cap_n as f32)/(total_cap_n as f32)*100.0 , + tags_n)); + } + } + + return Ok(msg); + } + } +//==================================================================================== + #[cfg(test)] mod tests { use super::*; diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 392ff48f..1036a8e1 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -8,9 +8,11 @@ mod consul; mod kubernetes; pub mod layout; +pub mod graph_algo; pub mod ring; pub mod system; + mod metrics; pub mod rpc_helper; diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs index 73a126a2..743a5cba 100644 --- a/src/rpc/ring.rs +++ b/src/rpc/ring.rs @@ -40,6 +40,7 @@ pub struct Ring { // Type to store compactly the id of a node in the system // Change this to u16 the day we want to have more than 256 nodes in a cluster pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; // The maximum number of times an object might get replicated // This must be at least 3 because Garage supports 3-way replication diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 68d94ea5..313671ca 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -97,6 +97,7 @@ pub struct System { kubernetes_discovery: Option, replication_factor: usize, + zone_redundancy: usize, /// The ring pub ring: watch::Receiver>, @@ -192,6 +193,7 @@ impl System { network_key: NetworkKey, background: Arc, replication_factor: usize, + zone_redundancy: usize, config: &Config, ) -> Arc { let node_key = @@ -211,7 +213,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor) + ClusterLayout::new(replication_factor, zone_redundancy) } }; @@ -285,6 +287,7 @@ impl System { rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), system_endpoint, replication_factor, + zone_redundancy, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, bootstrap_peers: config.bootstrap_peers.clone(), diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs deleted file mode 100644 index 
1e1e9caa..00000000 --- a/src/util/bipartite.rs +++ /dev/null @@ -1,363 +0,0 @@ -/* - * This module deals with graph algorithm in complete bipartite - * graphs. It is used in layout.rs to build the partition to node - * assignation. - * */ - -use rand::prelude::SliceRandom; -use std::cmp::{max, min}; -use std::collections::VecDeque; - -//Graph data structure for the flow algorithm. -#[derive(Clone, Copy, Debug)] -struct EdgeFlow { - c: i32, - flow: i32, - v: usize, - rev: usize, -} - -//Graph data structure for the detection of positive cycles. -#[derive(Clone, Copy, Debug)] -struct WeightedEdge { - w: i32, - u: usize, - v: usize, -} - -/* This function takes two matchings (old_match and new_match) in a - * complete bipartite graph. It returns a matching that has the - * same degree as new_match at every vertex, and that is as close - * as possible to old_match. - * */ -pub fn optimize_matching( - old_match: &[Vec], - new_match: &[Vec], - nb_right: usize, -) -> Vec> { - let nb_left = old_match.len(); - let ed = WeightedEdge { w: -1, u: 0, v: 0 }; - let mut edge_vec = vec![ed; nb_left * nb_right]; - - //We build the complete bipartite graph structure, represented - //by the list of all edges. - for i in 0..nb_left { - for j in 0..nb_right { - edge_vec[i * nb_right + j].u = i; - edge_vec[i * nb_right + j].v = nb_left + j; - } - } - - for i in 0..edge_vec.len() { - //We add the old matchings - if old_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { - edge_vec[i].w *= -1; - } - //We add the new matchings - if new_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { - (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - //Now edge_vec is a graph where edges are oriented LR if we - //can add them to new_match, and RL otherwise. If - //adding/removing them makes the matching closer to old_match - //they have weight 1; and -1 otherwise. - - //We shuffle the edge list so that there is no bias depending in - //partitions/zone label in the triplet dispersion - let mut rng = rand::thread_rng(); - edge_vec.shuffle(&mut rng); - - //Discovering and flipping a cycle with positive weight in this - //graph will make the matching closer to old_match. - //We use Bellman Ford algorithm to discover positive cycles - while let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { - for i in cycle { - //We flip the edges of the cycle. - (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - - //The optimal matching is build from the graph structure. - let mut matching = vec![Vec::::new(); nb_left]; - for e in edge_vec { - if e.u > e.v { - matching[e.v].push(e.u - nb_left); - } - } - matching -} - -//This function finds a positive cycle in a bipartite wieghted graph. -fn positive_cycle( - edge_vec: &[WeightedEdge], - nb_left: usize, - nb_right: usize, -) -> Option> { - let nb_side_min = min(nb_left, nb_right); - let nb_vertices = nb_left + nb_right; - let weight_lowerbound = -((nb_left + nb_right) as i32) - 1; - let mut accessed = vec![false; nb_left]; - - //We try to find a positive cycle accessible from the left - //vertex i. - for i in 0..nb_left { - if accessed[i] { - continue; - } - let mut weight = vec![weight_lowerbound; nb_vertices]; - let mut prev = vec![edge_vec.len(); nb_vertices]; - weight[i] = 0; - //We compute largest weighted paths from i. - //Since the graph is bipartite, any simple cycle has length - //at most 2*nb_side_min. 
In the general Bellman-Ford - //algorithm, the bound here is the number of vertices. Since - //the number of partitions can be much larger than the - //number of nodes, we optimize that. - for _ in 0..(2 * nb_side_min) { - for (j, e) in edge_vec.iter().enumerate() { - if weight[e.v] < weight[e.u] + e.w { - weight[e.v] = weight[e.u] + e.w; - prev[e.v] = j; - } - } - } - //We update the accessed table - for i in 0..nb_left { - if weight[i] > weight_lowerbound { - accessed[i] = true; - } - } - //We detect positive cycle - for e in edge_vec { - if weight[e.v] < weight[e.u] + e.w { - //it means e is on a path branching from a positive cycle - let mut was_seen = vec![false; nb_vertices]; - let mut curr = e.u; - //We track back with prev until we reach the cycle. - while !was_seen[curr] { - was_seen[curr] = true; - curr = edge_vec[prev[curr]].u; - } - //Now curr is on the cycle. We collect the edges ids. - let mut cycle = vec![prev[curr]]; - let mut cycle_vert = edge_vec[prev[curr]].u; - while cycle_vert != curr { - cycle.push(prev[cycle_vert]); - cycle_vert = edge_vec[prev[cycle_vert]].u; - } - - return Some(cycle); - } - } - } - - None -} - -// This function takes two arrays of capacity and computes the -// maximal matching in the complete bipartite graph such that the -// left vertex i is matched to left_cap_vec[i] right vertices, and -// the right vertex j is matched to right_cap_vec[j] left vertices. -// To do so, we use Dinic's maximum flow algorithm. -pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) -> Vec> { - let mut graph = Vec::>::new(); - let ed = EdgeFlow { - c: 0, - flow: 0, - v: 0, - rev: 0, - }; - - // 0 will be the source - graph.push(vec![ed; left_cap_vec.len()]); - for (i, c) in left_cap_vec.iter().enumerate() { - graph[0][i].c = *c as i32; - graph[0][i].v = i + 2; - graph[0][i].rev = 0; - } - - //1 will be the sink - graph.push(vec![ed; right_cap_vec.len()]); - for (i, c) in right_cap_vec.iter().enumerate() { - graph[1][i].c = *c as i32; - graph[1][i].v = i + 2 + left_cap_vec.len(); - graph[1][i].rev = 0; - } - - //we add left vertices - for i in 0..left_cap_vec.len() { - graph.push(vec![ed; 1 + right_cap_vec.len()]); - graph[i + 2][0].c = 0; //directed - graph[i + 2][0].v = 0; - graph[i + 2][0].rev = i; - - for j in 0..right_cap_vec.len() { - graph[i + 2][j + 1].c = 1; - graph[i + 2][j + 1].v = 2 + left_cap_vec.len() + j; - graph[i + 2][j + 1].rev = i + 1; - } - } - - //we add right vertices - for i in 0..right_cap_vec.len() { - let lft_ln = left_cap_vec.len(); - graph.push(vec![ed; 1 + lft_ln]); - graph[i + lft_ln + 2][0].c = graph[1][i].c; - graph[i + lft_ln + 2][0].v = 1; - graph[i + lft_ln + 2][0].rev = i; - - for j in 0..left_cap_vec.len() { - graph[i + 2 + lft_ln][j + 1].c = 0; //directed - graph[i + 2 + lft_ln][j + 1].v = j + 2; - graph[i + 2 + lft_ln][j + 1].rev = i + 1; - } - } - - //To ensure the dispersion of the triplets generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //left vertices do not consider the right ones in the same order. - let mut rng = rand::thread_rng(); - for i in 0..graph.len() { - graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. - for j in 0..graph[i].len() { - let target_v = graph[i][j].v; - let target_rev = graph[i][j].rev; - graph[target_v][target_rev].rev = j; - } - } - - let nb_vertices = graph.len(); - - //We run Dinic's max flow algorithm - loop { - //We build the level array from Dinic's algorithm. 
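-		//(level[v] is the BFS distance from the source along edges with positive
-		//residual capacity c - flow; the DFS further down only advances to a
-		//strictly higher level, which is what bounds the number of phases of
-		//Dinic's algorithm.)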
- let mut level = vec![-1; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((0, 0)); - while !fifo.is_empty() { - if let Some((id, lvl)) = fifo.pop_front() { - if level[id] == -1 { - level[id] = lvl; - for e in graph[id].iter() { - if e.c - e.flow > 0 { - fifo.push_back((e.v, lvl + 1)); - } - } - } - } - } - if level[1] == -1 { - //There is no residual flow - break; - } - - //Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); - - let flow_upper_bound = if let Some(x) = left_cap_vec.iter().max() { - *x as i32 - } else { - panic!(); - }; - - lifo.push_back((0, flow_upper_bound)); - - while let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == 1 { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - graph[id][nbd].flow += f; - let id_v = graph[id][nbd].v; - let nbd_v = graph[id][nbd].rev; - graph[id_v][nbd_v].flow -= f; - } - } - lifo.push_back((0, flow_upper_bound)); - continue; - } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { - next_nbd[*parent] += 1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow); - if level[graph[id][nbd].v] <= level[id] || new_flow == 0 { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - //otherwise, we send flow to nbd. - lifo.push_back((graph[id][nbd].v, new_flow)); - } - } - - //We return the association - let assoc_table = (0..left_cap_vec.len()) - .map(|id| { - graph[id + 2] - .iter() - .filter(|e| e.flow > 0) - .map(|e| e.v - 2 - left_cap_vec.len()) - .collect() - }) - .collect(); - - //consistency check - - //it is a flow - for i in 3..graph.len() { - assert!(graph[i].iter().map(|e| e.flow).sum::() == 0); - for e in graph[i].iter() { - assert!(e.flow + graph[e.v][e.rev].flow == 0); - } - } - - //it solves the matching problem - for i in 0..left_cap_vec.len() { - assert!(left_cap_vec[i] as i32 == graph[i + 2].iter().map(|e| max(0, e.flow)).sum::()); - } - for i in 0..right_cap_vec.len() { - assert!( - right_cap_vec[i] as i32 - == graph[i + 2 + left_cap_vec.len()] - .iter() - .map(|e| max(0, e.flow)) - .sum::() - ); - } - - assoc_table -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_flow() { - let left_vec = vec![3; 8]; - let right_vec = vec![0, 4, 8, 4, 8]; - //There are asserts in the function that computes the flow - let _ = dinic_compute_matching(left_vec, right_vec); - } - - //maybe add tests relative to the matching optilization ? 
-} diff --git a/src/util/lib.rs b/src/util/lib.rs index 891549c3..e83fc2e6 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -4,7 +4,6 @@ extern crate tracing; pub mod background; -pub mod bipartite; pub mod config; pub mod crdt; pub mod data; -- cgit v1.2.3 From bd842e1388a324e2a3956465e9b32d0dc739a8d9 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 22 Sep 2022 19:30:01 +0200 Subject: Correction of a few bugs in the tests, modification of ClusterLayout::check --- src/rpc/graph_algo.rs | 41 ++++++------ src/rpc/layout.rs | 173 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 137 insertions(+), 77 deletions(-) (limited to 'src') diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 1a809b80..a5a1e4ba 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -182,7 +182,7 @@ impl Graph{ //assignation, we shuffle the neighbours of the nodes. Hence, //the vertices do not consider their neighbours in the same order. self.shuffle_edges(); - + //We run Dinic's max flow algorithm loop { //We build the level array from Dinic's algorithm. @@ -206,7 +206,6 @@ impl Graph{ //There is no residual flow break; } - //Now we run DFS respecting the level array let mut next_nbd = vec![0; nb_vertices]; let mut lifo = VecDeque::new(); @@ -220,14 +219,12 @@ impl Graph{ //The DFS reached the sink, we can add a //residual flow. lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i32; - let id_rev = self.graph[id][nbd].dest; - let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i32; - } + while let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i32; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i32; } lifo.push_back((idsource, flow_upper_bound)); continue; @@ -243,10 +240,14 @@ impl Graph{ continue; } //else we can try to send flow from id to its nbd - let new_flow = min(f, self.graph[id][nbd].cap - self.graph[id][nbd].flow as u32 ); + let new_flow = min(f as i32, self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow) as u32; + if new_flow == 0 { + next_nbd[id] += 1; + continue; + } if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]){ - if lvldest <= lvlid || new_flow == 0 { + if lvldest <= lvlid { //We cannot send flow to nbd. next_nbd[id] += 1; continue; @@ -266,7 +267,6 @@ impl Graph{ // one needs to be present in the cost function. pub fn optimize_flow_with_cost(&mut self , cost: &CostFunction, path_length: usize ) -> Result<(),String>{ - //We build the weighted graph g where we will look for negative cycle let mut gf = self.build_cost_graph(cost)?; let mut cycles = gf.list_negative_cycles(path_length); @@ -364,6 +364,7 @@ impl Graph{ } } + //If self.graph contains a negative cycle, then at this point the graph described //by prev (which is a directed 1-forest/functional graph) //must contain a cycle. We list the cycles of prev. @@ -401,8 +402,9 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { //We discovered an id that we explored at this iteration t. 
//It means we are on a cycle let mut cy = vec![id; 1]; - let id2 = id; - while let Some(id2) = forest[id2] { + let mut id2 = id; + while let Some(id_next) = forest[id2] { + id2 = id_next; if id2 != id { cy.push(id2); } @@ -429,12 +431,5 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { mod tests { use super::*; - #[test] - fn test_flow() { - let left_vec = vec![3; 8]; - let right_vec = vec![0, 4, 8, 4, 8]; - //There are asserts in the function that computes the flow - } - - //maybe add tests relative to the matching optilization ? } + diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index ff60ce98..a878f19c 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::collections::HashSet; use hex::ToHex; +use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -185,7 +186,8 @@ impl ClusterLayout { pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err("The Uuid does not correspond to a node present in the cluster or this node does not have a positive capacity.".to_string()) + _ => return Err("The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity.".to_string()) } } @@ -242,6 +244,47 @@ impl ClusterLayout { } } + //Check that every partition is associated to distinct nodes + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignation_data[rf*p..rf*(p+1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return false; + } + //Check that every partition is spread over at least zone_redundancy zones. + let zones_of_p = nodes_of_p.iter() + .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.")); + if zones_of_p.unique().count() < self.zone_redundancy { + return false; + } + } + + //Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignation_data.iter() { + node_usage[*n as usize] += 1; + } + for n in 0..MAX_NODE_NUMBER { + if node_usage[n] > 0 { + let uuid = self.node_id_vec[n]; + if node_usage[n]*self.partition_size > self.get_node_capacity(&uuid) + .expect("Critical Error"){ + return false; + } + } + } + + //Check that the partition size stored is the one computed by the asignation + //algorithm. + let cl2 = self.clone(); + let (_ , zone_to_id) = cl2.generate_zone_ids().expect("Critical Error"); + let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); + if partition_size != self.partition_size { + return false; + } + + true } @@ -267,7 +310,7 @@ impl ClusterLayout { self.zone_redundancy = redundancy; let mut msg = Message::new(); - msg.push(format!("Computation of a new cluster layout where partitions are + msg.push(format!("Computation of a new cluster layout where partitions are \ replicated {} times on at least {} distinct zones.", replication, redundancy)); //We generate for once numerical ids for the zone, to use them as indices in the @@ -276,16 +319,19 @@ impl ClusterLayout { msg.push(format!("The cluster contains {} nodes spread over {} zones.", self.useful_nodes().len(), id_to_zone.len())); - + //We compute the optimal partition size + //Capacities should be given in a unit so that partition size is at least 100. + //In this case, integer rounding plays a marginal role in the percentages of + //optimality. 
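+		//For instance (illustrative numbers): with 256 partitions and 3-way
+		//replication there are 768 partition copies to place; a total capacity
+		//expressed as 10_000 (in GB) caps the partition size near 10_000/768 ~ 13,
+		//where a rounding step of 1 is ~8% of the value, while expressed as
+		//10_000_000 (in MB) the same step is ~0.008%.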
let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; if old_assignation_opt != None { - msg.push(format!("Given the replication and redundancy constraint, the - optimal size of a partition is {}. In the previous layout, it used to + msg.push(format!("Given the replication and redundancy constraint, the \ + optimal size of a partition is {}. In the previous layout, it used to \ be {}.", partition_size, self.partition_size)); } else { - msg.push(format!("Given the replication and redundancy constraints, the + msg.push(format!("Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", partition_size)); } self.partition_size = partition_size; @@ -293,13 +339,13 @@ impl ClusterLayout { //We compute a first flow/assignment that is heuristically close to the previous //assignment let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; - if let Some(assoc) = &old_assignation_opt { //We minimize the distance to the previous assignment. self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; } msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); + msg.push("".to_string()); //We update the layout structure self.update_ring_from_flow(id_to_zone.len() , &gflow)?; @@ -321,7 +367,8 @@ impl ClusterLayout { .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(format!("There are more than {} non-gateway nodes in the new layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + return Err(format!("There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); } let mut new_gateway_nodes: Vec = self.roles.items().iter() @@ -346,7 +393,8 @@ impl ClusterLayout { return Ok(None); } if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err("The old assignation does not have a size corresponding to the old replication factor or the number of partitions.".to_string()); + return Err("The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions.".to_string()); } //We build a translation table between the uuid and new ids @@ -384,7 +432,8 @@ impl ClusterLayout { for uuid in self.node_id_vec.iter() { if self.roles.get(uuid) == None { - return Err("The uuid was not found in the node roles (this should not happen, it might be a critical error).".to_string()); + return Err("The uuid was not found in the node roles (this should \ + not happen, it might be a critical error).".to_string()); } match self.node_role(&uuid) { Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { @@ -405,7 +454,8 @@ impl ClusterLayout { let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err("The storage capacity of he cluster is to small. It is impossible to store partitions of size 1.".to_string()); + return Err("The storage capacity of he cluster is to small. 
It is \ + impossible to store partitions of size 1.".to_string()); } let mut s_down = 1; @@ -525,11 +575,12 @@ impl ClusterLayout { } if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err("Critical Error : the association ring we produced does not have the right size.".to_string()); + return Err("Critical Error : the association ring we produced does not \ + have the right size.".to_string()); } return Ok(()); } - + //This function returns a message summing up the partition repartition of the new //layout. @@ -546,9 +597,16 @@ impl ClusterLayout { let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap , total_cap , percent_cap )); - msg.push(format!("If the percentage is to low, it might be that the replication/redundancy constraints force the use of nodes/zones with small storage capacities. - You might want to rebalance the storage capacities or relax the constraints. See the detailed statistics below and look for saturated nodes/zones.")); - msg.push(format!("Recall that because of the replication, the actual available storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); + msg.push(format!("")); + msg.push(format!("If the percentage is to low, it might be that the \ + replication/redundancy constraints force the use of nodes/zones with small \ + storage capacities. \ + You might want to rebalance the storage capacities or relax the constraints. \ + See the detailed statistics below and look for saturated nodes/zones.")); + msg.push(format!("Recall that because of the replication, the actual available \ + storage capacity is {} / {} = {}.", + used_cap , self.replication_factor , + used_cap/self.replication_factor as u32)); //We define and fill in the following tables let storing_nodes = self.useful_nodes(); @@ -563,6 +621,16 @@ impl ClusterLayout { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; if pz_nodes.len() > 0 { stored_partitions_zone[z] += 1; + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p.push( + zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } } for vert in pz_nodes.iter() { if let Vertex::N(n) = *vert { @@ -574,21 +642,17 @@ impl ClusterLayout { } } } - if let Some(old_assoc) = old_assoc_opt { - let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { - old_zones_of_p.push( - zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } } } + + if *old_assoc_opt == None { + new_partitions = stored_partitions.clone(); + new_partitions_zone = stored_partitions_zone.clone(); + } //We display the statistics + msg.push(format!("")); if *old_assoc_opt != None { let total_new_partitions : usize = new_partitions.iter().sum(); msg.push(format!("A total of {} new copies of partitions need to be \ @@ -608,16 +672,9 @@ impl ClusterLayout { .map(|n| stored_partitions[*n]).sum(); msg.push(format!("")); - if *old_assoc_opt != None { - msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ + msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], new_partitions_zone[z], replicated_partitions)); - } - else{ - msg.push(format!("Zone 
{}: {} distinct partitions stored ({} partition \ - copies) ", - id_to_zone[z], stored_partitions_zone[z], replicated_partitions)); - } let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; let mut total_cap_z = 0; @@ -625,18 +682,17 @@ impl ClusterLayout { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); - msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", + msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", available_cap_z, total_cap_z, percent_cap_z)); - msg.push(format!("")); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; let tags_n = (self.node_role(&self.node_id_vec[*n]) .ok_or("Node not found."))?.tags_string(); - msg.push(format!(" Node {}: {} partitions ({} new) ; \ + msg.push(format!(" Node {}: {} partitions ({} new) ; \ available/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec().encode_hex::(), + &self.node_id_vec[*n].to_vec()[0..2].to_vec().encode_hex::(), stored_partitions[*n], new_partitions[*n], available_cap_n, total_cap_n, (available_cap_n as f32)/(total_cap_n as f32)*100.0 , @@ -654,16 +710,14 @@ impl ClusterLayout { #[cfg(test)] mod tests { use super::*; - use itertools::Itertools; - + use std::io::*; +// use itertools::Itertools; +/* fn check_assignation(cl: &ClusterLayout) { //Check that input data has the right format let nb_partitions = 1usize << PARTITION_BITS; - assert!([1, 2, 3].contains(&cl.replication_factor)); assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); - let (node_zone, node_capacity) = cl.get_node_zone_capacity(); - //Check that is is a correct assignation with zone redundancy let rf = cl.replication_factor; for i in 0..nb_partitions { @@ -743,6 +797,13 @@ mod tests { } } } +*/ + + fn show_msg(msg : &Message) { + for s in msg.iter(){ + println!("{}",s); + } + } fn update_layout( cl: &mut ClusterLayout, @@ -769,7 +830,8 @@ mod tests { #[test] fn test_assignation() { - let mut node_id_vec = vec![1, 2, 3]; + std::io::stdout().flush().ok().expect("Could not flush stdout"); + let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] .into_iter() @@ -782,14 +844,16 @@ mod tests { roles: LwwMap::new(), replication_factor: 3, + zone_redundancy: 1, + partition_size: 0, ring_assignation_data: vec![], version: 0, staging: LwwMap::new(), - staging_hash: sha256sum(&[1; 32]), + staging_hash: blake2sum(&rmp_to_vec_all_named(&LwwMap::::new()).unwrap()[..]), }; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; @@ -798,17 +862,18 @@ mod tests { .map(|x| x.to_string()) .collect(); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - 
cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); - node_capacity_vec = vec![4000, 4000, 2000, 7000, 1000, 9000, 2000, 10, 2000]; + node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,1).unwrap()); + assert!(cl.check()); + } } -- cgit v1.2.3 From 99f96b9564c9c841dc6c56f1255a6e70ff884d46 Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 4 Oct 2022 18:09:24 +0200 Subject: deleted zone_redundancy from System struct --- src/rpc/system.rs | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 313671ca..34031b10 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -97,7 +97,6 @@ pub struct System { kubernetes_discovery: Option, replication_factor: usize, - zone_redundancy: usize, /// The ring pub ring: watch::Receiver>, @@ -287,7 +286,6 @@ impl System { rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), system_endpoint, replication_factor, - zone_redundancy, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, bootstrap_peers: config.bootstrap_peers.clone(), -- cgit v1.2.3 From ceac3713d6639f9170fc3b4475fae4a30b34483c Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 5 Oct 2022 15:29:48 +0200 Subject: modifications in several files to : - have consistent error return types - store the zone redundancy in a Lww - print the error and message in the CLI (TODO: for the server Api, should msg be returned in the body response?) --- src/api/admin/cluster.rs | 7 ++- src/garage/cli/layout.rs | 35 ++++++++------ src/rpc/layout.rs | 118 +++++++++++++++++++++++++++++------------------ src/rpc/system.rs | 3 +- 4 files changed, 100 insertions(+), 63 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 99c6e332..630179b5 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -162,7 +162,12 @@ pub async fn handle_apply_cluster_layout( let param = parse_json_body::(req).await?; let layout = garage.system.get_cluster_layout(); - let layout = layout.apply_staged_changes(Some(param.version))?; + let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; + //TODO : how to display msg ? Should it be in the Body Response ? 
+ for s in msg.iter() { + println!("{}", s); + } + garage.system.update_cluster_layout(&layout).await?; Ok(Response::builder() diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 3884bb92..a5b838e7 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -188,19 +188,23 @@ pub async fn cmd_show_layout( // this will print the stats of what partitions // will move around when we apply - if layout.calculate_partition_assignation() { - println!("To enact the staged role changes, type:"); - println!(); - println!(" garage layout apply --version {}", layout.version + 1); - println!(); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - layout.version + 1 - ); - } else { - println!("Not enough nodes have an assigned role to maintain enough copies of data."); - println!("This new layout cannot yet be applied."); - } + match layout.calculate_partition_assignation() { + Ok(msg) => { + for line in msg.iter() { + println!("{}", line); + } + println!("To enact the staged role changes, type:"); + println!(); + println!(" garage layout apply --version {}", layout.version + 1); + println!(); + println!( + "You can also revert all proposed changes with: garage layout revert --version {}", + layout.version + 1)}, + Err(Error::Message(s)) => { + println!("Error while trying to compute the assignation: {}", s); + println!("This new layout cannot yet be applied.");}, + _ => { println!("Unknown Error"); }, + } } Ok(()) @@ -213,7 +217,10 @@ pub async fn cmd_apply_layout( ) -> Result<(), Error> { let layout = fetch_layout(rpc_cli, rpc_host).await?; - let layout = layout.apply_staged_changes(apply_opt.version)?; + let (layout, msg) = layout.apply_staged_changes(apply_opt.version)?; + for line in msg.iter() { + println!("{}", line); + } send_layout(rpc_cli, rpc_host, layout).await?; diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 16d573c7..8d2b3e17 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; -use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; +use garage_util::crdt::{AutoCrdt, Crdt, LwwMap, Lww}; use garage_util::data::*; use garage_util::error::*; @@ -27,12 +27,10 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, - #[serde(default="default_one")] - pub zone_redundancy: usize, //This attribute is only used to retain the previously computed partition size, //to know to what extent does it change with the layout update. - #[serde(default="default_zero")] + #[serde(default="default_partition_size")] pub partition_size: u32, pub roles: LwwMap, @@ -51,17 +49,31 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout + #[serde(default="default_layout_parameters")] + pub parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } -fn default_one() -> usize{ - return 1; -} -fn default_zero() -> u32{ +fn default_partition_size() -> u32{ return 0; } +fn default_layout_parameters() -> Lww{ + Lww::::new(LayoutParameters{ zone_redundancy: 1}) +} + +///This struct is used to set the parameters to be used in the assignation computation +///algorithm. It is stored as a Crdt. 
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct LayoutParameters {
+	pub zone_redundancy:usize,
+}
+
+impl AutoCrdt for LayoutParameters {
+	const WARN_IF_DIFFERENT: bool = true;
+}
+
 const NB_PARTITIONS : usize = 1usize << PARTITION_BITS;
 
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
@@ -108,18 +120,24 @@ impl NodeRole {
 }
 
 impl ClusterLayout {
-	pub fn new(replication_factor: usize, zone_redundancy: usize) -> Self {
+	pub fn new(replication_factor: usize) -> Self {
+
+		//We set the default zone redundancy to be equal to the replication factor,
+		//i.e. as strict as possible.
+		let default_parameters = Lww::<LayoutParameters>::new(
+			LayoutParameters{ zone_redundancy: replication_factor});
+
 		let empty_lwwmap = LwwMap::new();
 		let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]);
 
 		ClusterLayout {
 			version: 0,
 			replication_factor,
-			zone_redundancy,
 			partition_size: 0,
 			roles: LwwMap::new(),
 			node_id_vec: Vec::new(),
 			ring_assignation_data: Vec::new(),
+			parameters: default_parameters,
 			staging: empty_lwwmap,
 			staging_hash: empty_lwwmap_hash,
 		}
@@ -132,6 +150,7 @@ impl ClusterLayout {
 				true
 			}
 			Ordering::Equal => {
+				self.parameters.merge(&other.parameters);
 				self.staging.merge(&other.staging);
 
 				let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]);
@@ -145,7 +164,7 @@ impl ClusterLayout {
 		}
 	}
 
-	pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> {
+	pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self,Message), Error> {
 		match version {
 			None => {
 				let error = r#"
@@ -164,16 +183,14 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		self.roles.merge(&self.staging);
 		self.roles.retain(|(_, _, v)| v.0.is_some());
 
-		if !self.calculate_partition_assignation() {
-			return Err(Error::Message("Could not calculate new assignation of partitions to nodes.
This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); - } + let msg = self.calculate_partition_assignation()?; self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); self.version += 1; - Ok(self) + Ok((self,msg)) } pub fn revert_staged_changes(mut self, version: Option) -> Result { @@ -231,24 +248,24 @@ To know the correct value of the new layout version, invoke `garage layout show` } ///Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid : &Uuid) -> Result { + pub fn get_node_zone(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(role) => return Ok(role.zone.clone()), - _ => return Err("The Uuid does not correspond to a node present in the cluster.".to_string()) + _ => return Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) } } ///Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err("The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity.".to_string()) + _ => return Err(Error::Message("The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity.".into())) } } ///Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.useful_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -311,7 +328,8 @@ To know the correct value of the new layout version, invoke `garage layout show` let zones_of_p = nodes_of_p.iter() .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.")); - if zones_of_p.unique().count() < self.zone_redundancy { + let redundancy = self.parameters.get().zone_redundancy; + if zones_of_p.unique().count() < redundancy { return false; } } @@ -354,7 +372,7 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - pub fn calculate_partition_assignation(&mut self, replication:usize, redundancy:usize) -> Result { + pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. @@ -362,12 +380,12 @@ impl ClusterLayout { //We update the node ids, since the node list might have changed with the staged //changes in the layout. 
We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - self.replication_factor = replication; - self.zone_redundancy = redundancy; + let redundancy = self.parameters.get().zone_redundancy; + let mut msg = Message::new(); msg.push(format!("Computation of a new cluster layout where partitions are \ - replicated {} times on at least {} distinct zones.", replication, redundancy)); + replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); //We generate for once numerical ids for the zone, to use them as indices in the //flow graphs. @@ -381,6 +399,7 @@ impl ClusterLayout { //In this case, integer rounding plays a marginal role in the percentages of //optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + if old_assignation_opt != None { msg.push(format!("Given the replication and redundancy constraint, the \ optimal size of a partition is {}. In the previous layout, it used to \ @@ -392,6 +411,12 @@ impl ClusterLayout { } self.partition_size = partition_size; + if partition_size < 100 { + msg.push("WARNING: The partition size is low (< 100), you might consider to \ + give the nodes capacities in a smaller unit (e.g. Mb instead of Gb) to \ + achieve a more tailored use of your storage ressources.".into()); + } + //We compute a first flow/assignment that is heuristically close to the previous //assignment let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; @@ -413,7 +438,7 @@ impl ClusterLayout { /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,String> { + fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,Error> { // (1) We compute the new node list //Non gateway nodes should be coded on 8bits, hence they must be first in the list //We build the new node ids @@ -423,8 +448,8 @@ impl ClusterLayout { .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(format!("There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", MAX_NODE_NUMBER).into() )); } let mut new_gateway_nodes: Vec = self.roles.items().iter() @@ -449,8 +474,8 @@ impl ClusterLayout { return Ok(None); } if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err("The old assignation does not have a size corresponding to \ - the old replication factor or the number of partitions.".to_string()); + return Err(Error::Message("The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions.".into())); } //We build a translation table between the uuid and new ids @@ -482,14 +507,14 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. 
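	///Note that only zones containing at least one node with a capacity (i.e. at
	///least one storage node) are given an id; zones made only of gateway nodes
	///are skipped, since they store no partition.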
-	fn generate_zone_ids(&self) -> Result<(Vec<String>, HashMap<String, usize>),String>{
+	fn generate_zone_ids(&self) -> Result<(Vec<String>, HashMap<String, usize>),Error>{
 		let mut id_to_zone = Vec::<String>::new();
 		let mut zone_to_id = HashMap::<String, usize>::new();

 		for uuid in self.node_id_vec.iter() {
 			if self.roles.get(uuid) == None {
-				return Err("The uuid was not found in the node roles (this should \
-				not happen, it might be a critical error).".to_string());
+				return Err(Error::Message("The uuid was not found in the node roles (this should \
+				not happen, it might be a critical error).".into()));
 			}
 			match self.node_role(&uuid) {
 				Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None {
@@ -504,14 +529,14 @@

 	///This function computes by dichotomy the largest realizable partition size, given
 	///the layout.
-	fn compute_optimal_partition_size(&self, zone_to_id: &HashMap<String, usize>) -> Result<u32, String>{
+	fn compute_optimal_partition_size(&self, zone_to_id: &HashMap<String, usize>) -> Result<u32, Error>{
 		let nb_partitions = 1usize << PARTITION_BITS;
 		let empty_set = HashSet::<(usize,usize)>::new();
 		let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?;
 		g.compute_maximal_flow()?;
 		if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() {
-			return Err("The storage capacity of he cluster is to small. It is \
-			impossible to store partitions of size 1.".to_string());
+			return Err(Error::Message("The storage capacity of the cluster is too small. It is \
+			impossible to store partitions of size 1.".into()));
 		}

 		let mut s_down = 1;
@@ -545,14 +570,15 @@
 		return vertices;
 	}

-	fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap<String, usize>, exclude_assoc : &HashSet<(usize,usize)>) -> Result<Graph<FlowEdge>, String> {
+	fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap<String, usize>, exclude_assoc : &HashSet<(usize,usize)>) -> Result<Graph<FlowEdge>, Error> {
 		let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(),
 			self.useful_nodes().len());
 		let mut g= Graph::<FlowEdge>::new(&vertices);
 		let nb_zones = zone_to_id.len();
+		let redundancy = self.parameters.get().zone_redundancy;
 		for p in 0..NB_PARTITIONS {
-			g.add_edge(Vertex::Source, Vertex::Pup(p), self.zone_redundancy as u32)?;
-			g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - self.zone_redundancy) as u32)?;
+			g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?;
+			g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?;
 			for z in 0..nb_zones {
 				g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?;
 				g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) ,
@@ -574,7 +600,7 @@ impl ClusterLayout {

 	fn compute_candidate_assignment(&self, zone_to_id: &HashMap<String, usize>,
-	old_assoc_opt : &Option<Vec<Vec<usize>>>) -> Result<Graph<FlowEdge>, String > {
+	old_assoc_opt : &Option<Vec<Vec<usize>>>) -> Result<Graph<FlowEdge>, Error > {

 		//We list the edges that are not used in the old association
 		let mut exclude_edge = HashSet::<(usize,usize)>::new();
@@ -601,7 +627,7 @@
 		return Ok(g);
 	}

-	fn minimize_rebalance_load(&self, gflow: &mut Graph<FlowEdge>, zone_to_id: &HashMap<String, usize>, old_assoc : &Vec<Vec<usize>>) -> Result<(), String > {
+	fn minimize_rebalance_load(&self, gflow: &mut Graph<FlowEdge>, zone_to_id: &HashMap<String, usize>, old_assoc : &Vec<Vec<usize>>) -> Result<(), Error > {
 		let mut cost = CostFunction::new();
 		for p in 0..NB_PARTITIONS {
 			for n in old_assoc[p].iter() {
@@ -616,7 +642,7 @@

-	fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph<FlowEdge> ) -> Result<(), String>{
+	fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph<FlowEdge> ) -> Result<(), Error>{
 		self.ring_assignation_data = 
Vec::::new(); for p in 0..NB_PARTITIONS { for z in 0..nb_zones { @@ -631,8 +657,8 @@ impl ClusterLayout { } if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err("Critical Error : the association ring we produced does not \ - have the right size.".to_string()); + return Err(Error::Message("Critical Error : the association ring we produced does not \ + have the right size.".into())); } return Ok(()); } @@ -643,7 +669,7 @@ impl ClusterLayout { fn output_stat(&self , gflow : &Graph, old_assoc_opt : &Option< Vec> >, zone_to_id: &HashMap, - id_to_zone : &Vec) -> Result{ + id_to_zone : &Vec) -> Result{ let mut msg = Message::new(); let nb_partitions = 1usize << PARTITION_BITS; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 7eb25195..9e0bfa11 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -196,7 +196,6 @@ impl System { network_key: NetworkKey, background: Arc, replication_factor: usize, - zone_redundancy: usize, config: &Config, ) -> Result, Error> { let node_key = @@ -226,7 +225,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor, zone_redundancy) + ClusterLayout::new(replication_factor) } }; -- cgit v1.2.3 From a951b6c45273e59b98f974937aebb8ada8816ab8 Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 5 Oct 2022 16:04:19 +0200 Subject: Added a CLI command to update the parameters for the layout computation (for now, only the zone redundancy) --- src/garage/cli/layout.rs | 35 +++++++++++++++++++++++++++++++++-- src/garage/cli/structs.rs | 14 +++++++++++++- 2 files changed, 46 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index a5b838e7..6b86e46d 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -14,8 +14,8 @@ pub async fn cli_layout_command_dispatch( rpc_host: NodeID, ) -> Result<(), Error> { match cmd { - LayoutOperation::Assign(configure_opt) => { - cmd_assign_role(system_rpc_endpoint, rpc_host, configure_opt).await + LayoutOperation::Assign(assign_opt) => { + cmd_assign_role(system_rpc_endpoint, rpc_host, assign_opt).await } LayoutOperation::Remove(remove_opt) => { cmd_remove_role(system_rpc_endpoint, rpc_host, remove_opt).await @@ -27,6 +27,9 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Revert(revert_opt) => { cmd_revert_layout(system_rpc_endpoint, rpc_host, revert_opt).await } + LayoutOperation::Config(config_opt) => { + cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await + } } } @@ -245,6 +248,34 @@ pub async fn cmd_revert_layout( Ok(()) } +pub async fn cmd_config_layout( + rpc_cli: &Endpoint, + rpc_host: NodeID, + config_opt: ConfigLayoutOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + match config_opt.redundancy { + None => (), + Some(r) => { + if r > layout.replication_factor { + println!("The zone redundancy must be smaller or equal to the \ + replication factor ({}).", layout.replication_factor); + } + else if r < 1 { + println!("The zone redundancy must be at least 1."); + } + else { + layout.parameters.update(LayoutParameters{ zone_redundancy: r }); + println!("The new zone redundancy has been staged."); + } + } + } + + send_layout(rpc_cli, rpc_host, layout).await?; + Ok(()) +} + // --- utility --- pub async fn fetch_layout( diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 06548e89..896379bb 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ 
-86,6 +86,10 @@ pub enum LayoutOperation { /// Remove role from Garage cluster node #[structopt(name = "remove", version = garage_version())] Remove(RemoveRoleOpt), + + /// Configure parameters value for the layout computation + #[structopt(name = "config", version = garage_version())] + Config(ConfigLayoutOpt), /// Show roles currently assigned to nodes and changes staged for commit #[structopt(name = "show", version = garage_version())] @@ -100,6 +104,7 @@ pub enum LayoutOperation { Revert(RevertLayoutOpt), } + #[derive(StructOpt, Debug)] pub struct AssignRoleOpt { /// Node(s) to which to assign role (prefix of hexadecimal node id) @@ -110,7 +115,7 @@ pub struct AssignRoleOpt { #[structopt(short = "z", long = "zone")] pub(crate) zone: Option, - /// Capacity (in relative terms, use 1 to represent your smallest server) + /// Capacity (in relative terms) #[structopt(short = "c", long = "capacity")] pub(crate) capacity: Option, @@ -133,6 +138,13 @@ pub struct RemoveRoleOpt { pub(crate) node_id: String, } +#[derive(StructOpt, Debug)] +pub struct ConfigLayoutOpt { + /// Zone redundancy parameter + #[structopt(short = "r", long = "redundancy")] + pub(crate) redundancy: Option, +} + #[derive(StructOpt, Debug)] pub struct ApplyLayoutOpt { /// Version number of new configuration: this command will fail if -- cgit v1.2.3 From 9407df60cc00fc70c10f73bc4b600085789d5353 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 6 Oct 2022 12:54:51 +0200 Subject: Corrected two bugs: - self.node_id_vec was not properly updated when the previous ring was empty - ClusterLayout::merge was not considering changes in the layout parameters --- src/garage/cli/layout.rs | 6 +++++- src/rpc/layout.rs | 56 ++++++++++++++++++++++++++++++------------------ src/rpc/system.rs | 1 + 3 files changed, 41 insertions(+), 22 deletions(-) (limited to 'src') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 6b86e46d..9e5bdaea 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -188,6 +188,10 @@ pub async fn cmd_show_layout( println!("No nodes have a role in the new layout."); } println!(); + + println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); + println!("Zone redundancy: {}", layout.parameters.get().zone_redundancy); + println!(); // this will print the stats of what partitions // will move around when we apply @@ -267,7 +271,7 @@ pub async fn cmd_config_layout( } else { layout.parameters.update(LayoutParameters{ zone_redundancy: r }); - println!("The new zone redundancy has been staged."); + println!("The new zone redundancy has been saved ({}).", r); } } } diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 8d2b3e17..89c18c68 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -150,15 +150,17 @@ impl ClusterLayout { true } Ordering::Equal => { + let param_changed = self.parameters.get() != other.parameters.get(); self.parameters.merge(&other.parameters); self.staging.merge(&other.staging); + let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - let changed = new_staging_hash != self.staging_hash; + let stage_changed = new_staging_hash != self.staging_hash; self.staging_hash = new_staging_hash; - changed + stage_changed || param_changed } Ordering::Less => false, } @@ -352,7 +354,7 @@ To know the correct value of the new layout version, invoke `garage layout show` //Check that the partition size stored is the one computed by the asignation //algorithm. 
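	//A mismatch here means the stored partition size was not computed from the
	//current roles and parameters, so the layout is considered invalid.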
let cl2 = self.clone(); - let (_ , zone_to_id) = cl2.generate_zone_ids().expect("Critical Error"); + let (_ , zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); if partition_size != self.partition_size { return false; @@ -371,13 +373,14 @@ impl ClusterLayout { /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of - /// data to be moved. + /// data to be moved. + /// Staged changes must be merged with nodes roles before calling this function. pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. - - //We update the node ids, since the node list might have changed with the staged + + //We update the node ids, since the node role list might have changed with the //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; @@ -387,12 +390,23 @@ impl ClusterLayout { msg.push(format!("Computation of a new cluster layout where partitions are \ replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); - //We generate for once numerical ids for the zone, to use them as indices in the - //flow graphs. - let (id_to_zone , zone_to_id) = self.generate_zone_ids()?; + //We generate for once numerical ids for the zones of non gateway nodes, + //to use them as indices in the flow graphs. + let (id_to_zone , zone_to_id) = self.generate_useful_zone_ids()?; + let nb_useful_nodes = self.useful_nodes().len(); msg.push(format!("The cluster contains {} nodes spread over {} zones.", - self.useful_nodes().len(), id_to_zone.len())); + nb_useful_nodes, id_to_zone.len())); + if nb_useful_nodes < self.replication_factor{ + return Err(Error::Message(format!("The number of nodes with positive \ + capacity ({}) is smaller than the replication factor ({}).", + nb_useful_nodes, self.replication_factor))); + } + if id_to_zone.len() < redundancy { + return Err(Error::Message(format!("The number of zones with non-gateway \ + nodes ({}) is smaller than the redundancy parameter ({})", + id_to_zone.len() , redundancy))); + } //We compute the optimal partition size //Capacities should be given in a unit so that partition size is at least 100. @@ -413,8 +427,7 @@ impl ClusterLayout { if partition_size < 100 { msg.push("WARNING: The partition size is low (< 100), you might consider to \ - give the nodes capacities in a smaller unit (e.g. Mb instead of Gb) to \ - achieve a more tailored use of your storage ressources.".into()); + provide the nodes capacities in a smaller unit (e.g. 
Mb instead of Gb).".into()); } //We compute a first flow/assignment that is heuristically close to the previous @@ -456,12 +469,14 @@ impl ClusterLayout { .filter(|(_, _, v)| match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) .map(|(k, _, _)| *k).collect(); - + let nb_useful_nodes = new_non_gateway_nodes.len(); let mut new_node_id_vec = Vec::::new(); new_node_id_vec.append(&mut new_non_gateway_nodes); new_node_id_vec.append(&mut new_gateway_nodes); + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); // (2) We retrieve the old association //We rewrite the old association with the new indices. We only consider partition @@ -490,15 +505,14 @@ impl ClusterLayout { let rf= self.replication_factor; for p in 0..nb_partitions { for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { - let uuid = self.node_id_vec[*old_id as usize]; + let uuid = old_node_id_vec[*old_id as usize]; if uuid_to_new_id.contains_key(&uuid) { old_assignation[p].push(uuid_to_new_id[&uuid]); } } } - //We write the results - self.node_id_vec = new_node_id_vec; + //We write the ring self.ring_assignation_data = Vec::::new(); return Ok(Some(old_assignation)); @@ -507,11 +521,11 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. - fn generate_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ + fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - - for uuid in self.node_id_vec.iter() { + + for uuid in self.useful_nodes().iter() { if self.roles.get(uuid) == None { return Err(Error::Message("The uuid was not found in the node roles (this should \ not happen, it might be a critical error).".into())); @@ -685,7 +699,7 @@ impl ClusterLayout { storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. 
\ See the detailed statistics below and look for saturated nodes/zones.")); - msg.push(format!("Recall that because of the replication, the actual available \ + msg.push(format!("Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); @@ -741,7 +755,7 @@ impl ClusterLayout { transferred.", total_new_partitions)); } msg.push(format!("")); - msg.push(format!("Detailed statistics by zones and nodes.")); + msg.push(format!("==== DETAILED STATISTICS BY ZONES AND NODES ====")); for z in 0..id_to_zone.len(){ let mut nodes_of_z = Vec::::new(); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 9e0bfa11..655d21de 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,6 +565,7 @@ impl System { return Err(Error::Message(msg)); } + let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3 From 911eb17bd9e25f2f02fbe1de81a3384e99ea13ac Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 6 Oct 2022 14:53:57 +0200 Subject: corrected warnings of cargo clippy --- src/rpc/graph_algo.rs | 26 ++++++------ src/rpc/layout.rs | 111 ++++++++++++++++++++++++-------------------------- 2 files changed, 66 insertions(+), 71 deletions(-) (limited to 'src') diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index a5a1e4ba..4e27631a 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -59,10 +59,10 @@ pub type CostFunction = HashMap<(Vertex,Vertex), i32>; impl Graph{ pub fn new(vertices : &[Vertex]) -> Self { let mut map = HashMap::::new(); - for i in 0..vertices.len() { - map.insert(vertices[i] , i); + for (i, vert) in vertices.iter().enumerate(){ + map.insert(*vert , i); } - return Graph:: { + Graph:: { vertextoid : map, idtovertex: vertices.to_vec(), graph : vec![Vec::< E >::new(); vertices.len() ] @@ -99,7 +99,7 @@ impl Graph{ result.push(self.idtovertex[edge.dest]); } } - return Ok(result); + Ok(result) } @@ -113,7 +113,7 @@ impl Graph{ for edge in self.graph[idv].iter() { result += max(0,self.graph[edge.dest][edge.rev].flow); } - return Ok(result); + Ok(result) } //This function returns the value of the flow outgoing from v. @@ -126,13 +126,13 @@ impl Graph{ for edge in self.graph[idv].iter() { result += max(0,edge.flow); } - return Ok(result); + Ok(result) } //This function computes the flow total value by computing the outgoing flow //from the source. pub fn get_flow_value(&mut self) -> Result { - return self.get_outflow(Vertex::Source); + self.get_outflow(Vertex::Source) } //This function shuffles the order of the edge lists. It keeps the ids of the @@ -157,7 +157,7 @@ impl Graph{ for edge in self.graph[idsource].iter(){ flow_upper_bound += edge.cap; } - return flow_upper_bound; + flow_upper_bound } //This function computes the maximal flow using Dinic's algorithm. 
It starts with @@ -270,7 +270,7 @@ impl Graph{ //We build the weighted graph g where we will look for negative cycle let mut gf = self.build_cost_graph(cost)?; let mut cycles = gf.list_negative_cycles(path_length); - while cycles.len() > 0 { + while !cycles.is_empty() { //we enumerate negative cycles for c in cycles.iter(){ for i in 0..c.len(){ @@ -293,7 +293,7 @@ impl Graph{ gf = self.build_cost_graph(cost)?; cycles = gf.list_negative_cycles(path_length); } - return Ok(()); + Ok(()) } //Construct the weighted graph G_f from the flow and the cost function @@ -319,7 +319,7 @@ impl Graph{ } } } - return Ok(g); + Ok(g) } @@ -334,7 +334,7 @@ impl Graph{ } let idu = self.vertextoid[&u]; let idv = self.vertextoid[&v]; - self.graph[idu].push( WeightedEdge{w: w , dest: idv} ); + self.graph[idu].push( WeightedEdge{ w , dest: idv} ); Ok(()) } @@ -415,7 +415,7 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { cycles.push(cy); } } - return cycles; + cycles } diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 89c18c68..1969b721 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -56,7 +56,7 @@ pub struct ClusterLayout { } fn default_partition_size() -> u32{ - return 0; + 0 } fn default_layout_parameters() -> Lww{ @@ -107,15 +107,15 @@ impl NodeRole { pub fn tags_string(&self) -> String { let mut tags = String::new(); - if self.tags.len() == 0 { + if self.tags.is_empty() { return tags } tags.push_str(&self.tags[0].clone()); for t in 1..self.tags.len(){ - tags.push_str(","); + tags.push(','); tags.push_str(&self.tags[t].clone()); } - return tags; + tags } } @@ -246,22 +246,22 @@ To know the correct value of the new layout version, invoke `garage layout show` _ => () } } - return result; + result } ///Given a node uuids, this function returns the label of its zone pub fn get_node_zone(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { - Some(role) => return Ok(role.zone.clone()), - _ => return Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) + Some(role) => Ok(role.zone.clone()), + _ => Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) } } ///Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { - Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err(Error::Message("The Uuid does not correspond to a node present in the \ + Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => Ok(*cap), + _ => Err(Error::Message("The Uuid does not correspond to a node present in the \ cluster or this node does not have a positive capacity.".into())) } } @@ -272,7 +272,7 @@ To know the correct value of the new layout version, invoke `garage layout show` for uuid in self.useful_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; } - return Ok(total_capacity); + Ok(total_capacity) } @@ -341,10 +341,10 @@ To know the correct value of the new layout version, invoke `garage layout show` for n in self.ring_assignation_data.iter() { node_usage[*n as usize] += 1; } - for n in 0..MAX_NODE_NUMBER { - if node_usage[n] > 0 { + for (n, usage) in node_usage.iter().enumerate(){ + if *usage > 0 { let uuid = self.node_id_vec[n]; - if node_usage[n]*self.partition_size > self.get_node_capacity(&uuid) + if usage*self.partition_size > self.get_node_capacity(&uuid) .expect("Critical Error"){ return false; } @@ -435,7 +435,7 @@ impl ClusterLayout { let mut 
gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { //We minimize the distance to the previous assignment. - self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); @@ -443,7 +443,7 @@ impl ClusterLayout { //We update the layout structure self.update_ring_from_flow(id_to_zone.len() , &gflow)?; - return Ok(msg); + Ok(msg) } /// The LwwMap of node roles might have changed. This function updates the node_id_vec @@ -456,21 +456,18 @@ impl ClusterLayout { //Non gateway nodes should be coded on 8bits, hence they must be first in the list //We build the new node ids let mut new_non_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| - match &v.0 {Some(r) if r.capacity != None => true, _=> false }) + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", MAX_NODE_NUMBER).into() )); + layout. This is not allowed.", MAX_NODE_NUMBER) )); } let mut new_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| - match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) .map(|(k, _, _)| *k).collect(); - let nb_useful_nodes = new_non_gateway_nodes.len(); let mut new_node_id_vec = Vec::::new(); new_node_id_vec.append(&mut new_non_gateway_nodes); new_node_id_vec.append(&mut new_gateway_nodes); @@ -484,7 +481,7 @@ impl ClusterLayout { let nb_partitions = 1usize << PARTITION_BITS; let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; - if self.ring_assignation_data.len() == 0 { + if self.ring_assignation_data.is_empty() { //This is a new association return Ok(None); } @@ -498,16 +495,16 @@ impl ClusterLayout { //We add the indices of only the new non-gateway nodes that can be used in the //association ring - for i in 0..nb_useful_nodes { - uuid_to_new_id.insert(new_node_id_vec[i], i ); + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i ); } let rf= self.replication_factor; - for p in 0..nb_partitions { + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { let uuid = old_node_id_vec[*old_id as usize]; if uuid_to_new_id.contains_key(&uuid) { - old_assignation[p].push(uuid_to_new_id[&uuid]); + old_assign_p.push(uuid_to_new_id[&uuid]); } } } @@ -515,7 +512,7 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); - return Ok(Some(old_assignation)); + Ok(Some(old_assignation)) } @@ -530,15 +527,14 @@ impl ClusterLayout { return Err(Error::Message("The uuid was not found in the node roles (this should \ not happen, it might be a critical error).".into())); } - match self.node_role(&uuid) { - Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone() , id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } - _ => () + if let Some(r) = self.node_role(uuid) { + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone() , id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } } } - return Ok((id_to_zone, 
zone_to_id)); + Ok((id_to_zone, zone_to_id)) } ///This function computes by dichotomy the largest realizable partition size, given @@ -566,7 +562,7 @@ impl ClusterLayout { } } - return Ok(s_down); + Ok(s_down) } fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { @@ -581,7 +577,7 @@ impl ClusterLayout { for n in 0..nb_nodes { vertices.push(Vertex::N(n)); } - return vertices; + vertices } fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { @@ -609,7 +605,7 @@ impl ClusterLayout { } } } - return Ok(g); + Ok(g) } @@ -620,11 +616,11 @@ impl ClusterLayout { let mut exclude_edge = HashSet::<(usize,usize)>::new(); if let Some(old_assoc) = old_assoc_opt { let nb_nodes = self.useful_nodes().len(); - for p in 0..NB_PARTITIONS { + for (p, old_assoc_p) in old_assoc.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p,n)); } - for n in old_assoc[p].iter() { + for n in old_assoc_p.iter() { exclude_edge.remove(&(p,*n)); } } @@ -638,13 +634,13 @@ impl ClusterLayout { g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; - return Ok(g); + Ok(g) } - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), Error > { + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &[Vec ]) -> Result<(), Error > { let mut cost = CostFunction::new(); - for p in 0..NB_PARTITIONS { - for n in old_assoc[p].iter() { + for (p, assoc_p) in old_assoc.iter().enumerate(){ + for n in assoc_p.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); } @@ -653,7 +649,7 @@ impl ClusterLayout { let path_length = 4*nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; - return Ok(()); + Ok(()) } fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ @@ -662,9 +658,8 @@ impl ClusterLayout { for z in 0..nb_zones { let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; for vertex in assoc_vertex.iter() { - match vertex{ - Vertex::N(n) => self.ring_assignation_data.push((*n).try_into().unwrap()), - _ => () + if let Vertex::N(n) = vertex { + self.ring_assignation_data.push((*n).try_into().unwrap()); } } } @@ -674,7 +669,7 @@ impl ClusterLayout { return Err(Error::Message("Critical Error : the association ring we produced does not \ have the right size.".into())); } - return Ok(()); + Ok(()) } @@ -683,7 +678,7 @@ impl ClusterLayout { fn output_stat(&self , gflow : &Graph, old_assoc_opt : &Option< Vec> >, zone_to_id: &HashMap, - id_to_zone : &Vec) -> Result{ + id_to_zone : &[String]) -> Result{ let mut msg = Message::new(); let nb_partitions = 1usize << PARTITION_BITS; @@ -693,12 +688,12 @@ impl ClusterLayout { let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap , total_cap , percent_cap )); - msg.push(format!("")); - msg.push(format!("If the percentage is to low, it might be that the \ + msg.push("".into()); + msg.push("If the percentage is to low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. 
\ - See the detailed statistics below and look for saturated nodes/zones.")); + See the detailed statistics below and look for saturated nodes/zones.".into()); msg.push(format!("Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", used_cap , self.replication_factor , @@ -715,7 +710,7 @@ impl ClusterLayout { for p in 0..nb_partitions { for z in 0..id_to_zone.len() { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - if pz_nodes.len() > 0 { + if !pz_nodes.is_empty() { stored_partitions_zone[z] += 1; if let Some(old_assoc) = old_assoc_opt { let mut old_zones_of_p = Vec::::new(); @@ -748,14 +743,14 @@ impl ClusterLayout { //We display the statistics - msg.push(format!("")); + msg.push("".into()); if *old_assoc_opt != None { let total_new_partitions : usize = new_partitions.iter().sum(); msg.push(format!("A total of {} new copies of partitions need to be \ transferred.", total_new_partitions)); } - msg.push(format!("")); - msg.push(format!("==== DETAILED STATISTICS BY ZONES AND NODES ====")); + msg.push("".into()); + msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); for z in 0..id_to_zone.len(){ let mut nodes_of_z = Vec::::new(); @@ -766,7 +761,7 @@ impl ClusterLayout { } let replicated_partitions : usize = nodes_of_z.iter() .map(|n| stored_partitions[*n]).sum(); - msg.push(format!("")); + msg.push("".into()); msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], @@ -796,7 +791,7 @@ impl ClusterLayout { } } - return Ok(msg); + Ok(msg) } } -- cgit v1.2.3 From fcf9ac674a2842b2b55d933e60af5af93dcc4592 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:19:25 +0200 Subject: Tests written in layout.rs added staged_parameters to ClusterLayout removed the serde(default) -> will need a migration function --- src/db/lib.rs | 2 +- src/garage/cli/layout.rs | 4 +- src/rpc/graph_algo.rs | 14 --- src/rpc/layout.rs | 232 +++++++++++++++++++++-------------------------- 4 files changed, 107 insertions(+), 145 deletions(-) (limited to 'src') diff --git a/src/db/lib.rs b/src/db/lib.rs index d96586be..af539494 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -3,7 +3,7 @@ extern crate tracing; #[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] -compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); +//compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); #[cfg(feature = "lmdb")] pub mod lmdb_adapter; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 9e5bdaea..32f637eb 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -190,7 +190,7 @@ pub async fn cmd_show_layout( println!(); println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); - println!("Zone redundancy: {}", layout.parameters.get().zone_redundancy); + println!("Zone redundancy: {}", layout.staged_parameters.get().zone_redundancy); println!(); // this will print the stats of what partitions @@ -270,7 +270,7 @@ pub async fn cmd_config_layout( println!("The zone redundancy must be at least 1."); } else { - layout.parameters.update(LayoutParameters{ zone_redundancy: r }); + layout.staged_parameters.update(LayoutParameters{ zone_redundancy: r }); println!("The new zone redundancy has been saved ({}).", r); } } diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 4e27631a..70ccf35a 100644 --- a/src/rpc/graph_algo.rs +++ 
b/src/rpc/graph_algo.rs @@ -419,17 +419,3 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { } -//==================================================================================== -//==================================================================================== -//==================================================================================== -//==================================================================================== -//==================================================================================== -//==================================================================================== - - -#[cfg(test)] -mod tests { - use super::*; - -} - diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 1969b721..976f94af 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -30,8 +30,8 @@ pub struct ClusterLayout { //This attribute is only used to retain the previously computed partition size, //to know to what extent does it change with the layout update. - #[serde(default="default_partition_size")] pub partition_size: u32, + pub parameters: LayoutParameters, pub roles: LwwMap, @@ -49,20 +49,11 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout - #[serde(default="default_layout_parameters")] - pub parameters: Lww, + pub staged_parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } -fn default_partition_size() -> u32{ - 0 -} - -fn default_layout_parameters() -> Lww{ - Lww::::new(LayoutParameters{ zone_redundancy: 1}) -} - ///This struct is used to set the parameters to be used in the assignation computation ///algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] @@ -124,8 +115,8 @@ impl ClusterLayout { //We set the default zone redundancy to be equal to the replication factor, //i.e. as strict as possible. - let default_parameters = Lww::::new( - LayoutParameters{ zone_redundancy: replication_factor}); + let parameters = LayoutParameters{ zone_redundancy: replication_factor}; + let staged_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); @@ -137,7 +128,8 @@ impl ClusterLayout { roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), - parameters: default_parameters, + parameters, + staged_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -150,8 +142,8 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.parameters.get() != other.parameters.get(); - self.parameters.merge(&other.parameters); + let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); + self.staged_parameters.merge(&other.staged_parameters); self.staging.merge(&other.staging); @@ -330,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let zones_of_p = nodes_of_p.iter() .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.")); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.parameters.zone_redundancy; if zones_of_p.unique().count() < redundancy { return false; } @@ -384,7 +376,8 @@ impl ClusterLayout { //changes in the layout. 
We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; + let mut msg = Message::new(); msg.push(format!("Computation of a new cluster layout where partitions are \ @@ -417,13 +410,15 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!("Given the replication and redundancy constraint, the \ optimal size of a partition is {}. In the previous layout, it used to \ - be {}.", partition_size, self.partition_size)); + be {} (the zone redundancy was {}).", partition_size, self.partition_size, + self.parameters.zone_redundancy)); } else { msg.push(format!("Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", partition_size)); } self.partition_size = partition_size; + self.parameters = self.staged_parameters.get().clone(); if partition_size < 100 { msg.push("WARNING: The partition size is low (< 100), you might consider to \ @@ -511,6 +506,10 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); + + if !self.check() { + return Err(Error::Message("Critical error: The computed layout happens to be incorrect".into())); + } Ok(Some(old_assignation)) } @@ -585,7 +584,7 @@ impl ClusterLayout { self.useful_nodes().len()); let mut g= Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; for p in 0..NB_PARTITIONS { g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; @@ -800,96 +799,80 @@ impl ClusterLayout { #[cfg(test)] mod tests { - use super::*; - use std::io::*; -// use itertools::Itertools; -/* - fn check_assignation(cl: &ClusterLayout) { - //Check that input data has the right format - let nb_partitions = 1usize << PARTITION_BITS; - assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); - - //Check that is is a correct assignation with zone redundancy - let rf = cl.replication_factor; - for i in 0..nb_partitions { - assert!( - rf == cl.ring_assignation_data[rf * i..rf * (i + 1)] - .iter() - .map(|nod| node_zone[*nod as usize].clone()) - .unique() - .count() - ); - } + use super::{*,Error}; + use std::cmp::min; + + + //This function checks that the partition size S computed is at least better than the + //one given by a very naive algorithm. To do so, we try to run the naive algorithm + //assuming a partion size of S+1. If we succed, it means that the optimal assignation + //was not optimal. The naive algorithm is the following : + //- we compute the max number of partitions associated to every node, capped at the + //partition number. It gives the number of tokens of every node. + //- every zone has a number of tokens equal to the sum of the tokens of its nodes. + //- we cycle over the partitions and associate zone tokens while respecting the + //zone redundancy constraint. + //NOTE: the naive algorithm is not optimal. 
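+	//(it hands out zone tokens greedily, partition after partition, so it can
+	//exhaust the flexible zones early and then be unable to satisfy the
+	//redundancy constraint for later partitions)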
+	//Counter example:
+	//take nb_partition = 3 ; replication_factor = 5; redundancy = 4;
+	//number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2)
+	//With these parameters, the naive algo fails, whereas there is a solution:
+	//(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E)
+	fn check_against_naive(cl: &ClusterLayout) -> Result<bool, Error> {
+		let over_size = cl.partition_size + 1;
+		let mut zone_token = HashMap::<String, usize>::new();
+		let nb_partitions = 1usize << PARTITION_BITS;
+
+		let (zones, zone_to_id) = cl.generate_useful_zone_ids()?;
+
+		if zones.is_empty() {
+			return Ok(false);
+		}
-	let nb_nodes = cl.node_id_vec.len();
-	//Check optimality
-	let node_nb_part = (0..nb_nodes)
-		.map(|i| {
-			cl.ring_assignation_data
-				.iter()
-				.filter(|x| **x == i as u8)
-				.count()
-		})
-		.collect::<Vec<_>>();
+		for z in zones.iter() {
+			zone_token.insert(z.clone(), 0);
+		}
+		for uuid in cl.useful_nodes().iter() {
+			let z = cl.get_node_zone(uuid)?;
+			let c = cl.get_node_capacity(uuid)?;
+			zone_token.insert(z.clone(), zone_token[&z] + min(nb_partitions , (c/over_size) as usize));
+		}
+
+		//For every partition, we count the number of zones already associated and
+		//the name of the last zone associated
-	let zone_vec = node_zone.iter().unique().collect::<Vec<_>>();
-	let zone_nb_part = zone_vec
-		.iter()
-		.map(|z| {
-			cl.ring_assignation_data
-				.iter()
-				.filter(|x| node_zone[**x as usize] == **z)
-				.count()
-		})
-		.collect::<Vec<_>>();
+		let mut id_zone_token = vec![0; zones.len()];
+		for (z,t) in zone_token.iter() {
+			id_zone_token[zone_to_id[z]] = *t;
+		}
-	//Check optimality of the zone assignation : would it be better for the
-	//node_capacity/node_partitions ratio to change the assignation of a partition
-
-	if let Some(idmin) = (0..nb_nodes).min_by(|i, j| {
-		(node_capacity[*i] * node_nb_part[*j] as u32)
-			.cmp(&(node_capacity[*j] * node_nb_part[*i] as u32))
-	}) {
-		if let Some(idnew) = (0..nb_nodes)
-			.filter(|i| {
-				if let Some(p) = zone_vec.iter().position(|z| **z == node_zone[*i]) {
-					zone_nb_part[p] < nb_partitions
-				} else {
-					false
-				}
-			})
-			.max_by(|i, j| {
-				(node_capacity[*i] * (node_nb_part[*j] as u32 + 1))
-					.cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1)))
-			}) {
-			assert!(
-				node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1)
-					>= node_capacity[idnew] * node_nb_part[idmin] as u32
-			);
-		}
-	}
+		let mut nb_token = vec![0; nb_partitions];
+		let mut last_zone = vec![zones.len(); nb_partitions];
+
+		let mut curr_zone = 0;
+
+		let redundancy = cl.parameters.zone_redundancy;
+
+		for replic in 0..cl.replication_factor {
+			for p in 0..nb_partitions {
+				while id_zone_token[curr_zone] == 0 ||
+					(last_zone[p] == curr_zone
+					&& redundancy - nb_token[p] <= cl.replication_factor - replic) {
+					curr_zone += 1;
+					if curr_zone >= zones.len() {
+						return Ok(true);
+					}
+				}
+				id_zone_token[curr_zone] -= 1;
+				if last_zone[p] != curr_zone {
+					nb_token[p] += 1;
+					last_zone[p] = curr_zone;
+				}
+			}
+		}
+
+		return Ok(false);
+	}

-	//In every zone, check optimality of the nod assignation
-	for z in zone_vec {
-		let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z);
-		if let Some(idmin) = node_of_z_iter.clone().min_by(|i, j| {
-			(node_capacity[*i] * node_nb_part[*j] as u32)
-				.cmp(&(node_capacity[*j] * node_nb_part[*i] as u32))
-		}) {
-			if let Some(idnew) = node_of_z_iter.min_by(|i, j| {
-				(node_capacity[*i] * (node_nb_part[*j] as u32 + 1))
-					.cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1)))
-			}) {
-				assert!(
-					node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1)
-						>= node_capacity[idnew] * node_nb_part[idmin] as 
u32 - ); - } - } - } - } -*/ - fn show_msg(msg : &Message) { for s in msg.iter(){ println!("{}",s); @@ -901,6 +884,7 @@ mod tests { node_id_vec: &Vec, node_capacity_vec: &Vec, node_zone_vec: &Vec, + zone_redundancy: usize ) { for i in 0..node_id_vec.len() { if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { @@ -917,11 +901,11 @@ mod tests { ); cl.roles.merge(&update); } + cl.staged_parameters = Lww::::new(LayoutParameters{zone_redundancy}); } #[test] fn test_assignation() { - std::io::stdout().flush().ok().expect("Could not flush stdout"); let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] @@ -929,22 +913,11 @@ mod tests { .map(|x| x.to_string()) .collect(); - let mut cl = ClusterLayout { - node_id_vec: vec![], - - roles: LwwMap::new(), - - replication_factor: 3, - zone_redundancy: 1, - partition_size: 0, - ring_assignation_data: vec![], - version: 0, - staging: LwwMap::new(), - staging_hash: blake2sum(&rmp_to_vec_all_named(&LwwMap::::new()).unwrap()[..]), - }; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + let mut cl = ClusterLayout::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; @@ -952,19 +925,22 @@ mod tests { .into_iter() .map(|x| x.to_string()) .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,1).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); } } -- cgit v1.2.3 From 4abab246f1113a9a1988fdfca81c1dd8ffa323c8 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:21:13 +0200 Subject: cargo fmt --- src/api/admin/cluster.rs | 8 +- src/db/lib.rs | 1 - src/garage/cli/layout.rs | 94 ++-- src/garage/cli/structs.rs | 7 +- src/rpc/graph_algo.rs | 754 ++++++++++++------------- src/rpc/layout.rs | 1332 ++++++++++++++++++++++++--------------------- src/rpc/lib.rs | 3 +- src/rpc/system.rs | 1 - 8 files changed, 1162 insertions(+), 1038 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 630179b5..da3d8c44 100644 --- 
a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -163,10 +163,10 @@ pub async fn handle_apply_cluster_layout( let layout = garage.system.get_cluster_layout(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; - //TODO : how to display msg ? Should it be in the Body Response ? - for s in msg.iter() { - println!("{}", s); - } + //TODO : how to display msg ? Should it be in the Body Response ? + for s in msg.iter() { + println!("{}", s); + } garage.system.update_cluster_layout(&layout).await?; diff --git a/src/db/lib.rs b/src/db/lib.rs index af539494..0a776a91 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -4,7 +4,6 @@ extern crate tracing; #[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] //compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); - #[cfg(feature = "lmdb")] pub mod lmdb_adapter; #[cfg(feature = "sled")] diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 32f637eb..f747fbe4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -27,9 +27,9 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Revert(revert_opt) => { cmd_revert_layout(system_rpc_endpoint, rpc_host, revert_opt).await } - LayoutOperation::Config(config_opt) => { - cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await - } + LayoutOperation::Config(config_opt) => { + cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await + } } } @@ -188,30 +188,37 @@ pub async fn cmd_show_layout( println!("No nodes have a role in the new layout."); } println!(); - + println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); - println!("Zone redundancy: {}", layout.staged_parameters.get().zone_redundancy); + println!( + "Zone redundancy: {}", + layout.staged_parameters.get().zone_redundancy + ); println!(); // this will print the stats of what partitions // will move around when we apply - match layout.calculate_partition_assignation() { - Ok(msg) => { - for line in msg.iter() { - println!("{}", line); - } - println!("To enact the staged role changes, type:"); - println!(); - println!(" garage layout apply --version {}", layout.version + 1); - println!(); - println!( + match layout.calculate_partition_assignation() { + Ok(msg) => { + for line in msg.iter() { + println!("{}", line); + } + println!("To enact the staged role changes, type:"); + println!(); + println!(" garage layout apply --version {}", layout.version + 1); + println!(); + println!( "You can also revert all proposed changes with: garage layout revert --version {}", - layout.version + 1)}, - Err(Error::Message(s)) => { - println!("Error while trying to compute the assignation: {}", s); - println!("This new layout cannot yet be applied.");}, - _ => { println!("Unknown Error"); }, - } + layout.version + 1) + } + Err(Error::Message(s)) => { + println!("Error while trying to compute the assignation: {}", s); + println!("This new layout cannot yet be applied."); + } + _ => { + println!("Unknown Error"); + } + } } Ok(()) @@ -225,9 +232,9 @@ pub async fn cmd_apply_layout( let layout = fetch_layout(rpc_cli, rpc_host).await?; let (layout, msg) = layout.apply_staged_changes(apply_opt.version)?; - for line in msg.iter() { - println!("{}", line); - } + for line in msg.iter() { + println!("{}", line); + } send_layout(rpc_cli, rpc_host, layout).await?; @@ -258,26 +265,29 @@ pub async fn cmd_config_layout( config_opt: ConfigLayoutOpt, ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; 
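	// Usage sketch for this subcommand (names taken from ConfigLayoutOpt; the
	// version number N passed to apply is assumed to be current version + 1):
	//   garage layout config --redundancy 2
	//   garage layout show
	//   garage layout apply --version N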
- - match config_opt.redundancy { - None => (), - Some(r) => { - if r > layout.replication_factor { - println!("The zone redundancy must be smaller or equal to the \ - replication factor ({}).", layout.replication_factor); - } - else if r < 1 { - println!("The zone redundancy must be at least 1."); - } - else { - layout.staged_parameters.update(LayoutParameters{ zone_redundancy: r }); - println!("The new zone redundancy has been saved ({}).", r); - } - } - } + + match config_opt.redundancy { + None => (), + Some(r) => { + if r > layout.replication_factor { + println!( + "The zone redundancy must be smaller or equal to the \ + replication factor ({}).", + layout.replication_factor + ); + } else if r < 1 { + println!("The zone redundancy must be at least 1."); + } else { + layout + .staged_parameters + .update(LayoutParameters { zone_redundancy: r }); + println!("The new zone redundancy has been saved ({}).", r); + } + } + } send_layout(rpc_cli, rpc_host, layout).await?; - Ok(()) + Ok(()) } // --- utility --- diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 896379bb..02ed8992 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -86,10 +86,10 @@ pub enum LayoutOperation { /// Remove role from Garage cluster node #[structopt(name = "remove", version = garage_version())] Remove(RemoveRoleOpt), - - /// Configure parameters value for the layout computation + + /// Configure parameters value for the layout computation #[structopt(name = "config", version = garage_version())] - Config(ConfigLayoutOpt), + Config(ConfigLayoutOpt), /// Show roles currently assigned to nodes and changes staged for commit #[structopt(name = "show", version = garage_version())] @@ -104,7 +104,6 @@ pub enum LayoutOperation { Revert(RevertLayoutOpt), } - #[derive(StructOpt, Debug)] pub struct AssignRoleOpt { /// Node(s) to which to assign role (prefix of hexadecimal node id) diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 70ccf35a..13c60692 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -1,42 +1,40 @@ - //! This module deals with graph algorithms. //! It is used in layout.rs to build the partition to node assignation. use rand::prelude::SliceRandom; use std::cmp::{max, min}; -use std::collections::VecDeque; use std::collections::HashMap; +use std::collections::VecDeque; //Vertex data structures used in all the graphs used in layout.rs. //usize parameters correspond to node/zone/partitions ids. //To understand the vertex roles below, please refer to the formal description //of the layout computation algorithm. -#[derive(Clone,Copy,Debug, PartialEq, Eq, Hash)] -pub enum Vertex{ - Source, - Pup(usize), //The vertex p+ of partition p - Pdown(usize), //The vertex p- of partition p - PZ(usize,usize), //The vertex corresponding to x_(partition p, zone z) - N(usize), //The vertex corresponding to node n - Sink +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Vertex { + Source, + Pup(usize), //The vertex p+ of partition p + Pdown(usize), //The vertex p- of partition p + PZ(usize, usize), //The vertex corresponding to x_(partition p, zone z) + N(usize), //The vertex corresponding to node n + Sink, } - //Edge data structure for the flow algorithm. 
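//(each edge stores the index of its reverse twin in the destination's edge
//list, so pushing flow in one direction can update the residual capacity of
//the other direction in O(1))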
//The graph is stored as an adjacency list #[derive(Clone, Copy, Debug)] pub struct FlowEdge { - cap: u32, //flow maximal capacity of the edge - flow: i32, //flow value on the edge - dest: usize, //destination vertex id - rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v + cap: u32, //flow maximal capacity of the edge + flow: i32, //flow value on the edge + dest: usize, //destination vertex id + rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v } //Edge data structure for the detection of negative cycles. //The graph is stored as a list of edges (u,v). #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { - w: i32, //weight of the edge + w: i32, //weight of the edge dest: usize, } @@ -47,375 +45,377 @@ impl Edge for WeightedEdge {} //Struct for the graph structure. We do encapsulation here to be able to both //provide user friendly Vertex enum to address vertices, and to use usize indices //and Vec instead of HashMap in the graph algorithm to optimize execution speed. -pub struct Graph{ - vertextoid : HashMap, - idtovertex : Vec, - - graph : Vec< Vec > -} +pub struct Graph { + vertextoid: HashMap, + idtovertex: Vec, -pub type CostFunction = HashMap<(Vertex,Vertex), i32>; - -impl Graph{ - pub fn new(vertices : &[Vertex]) -> Self { - let mut map = HashMap::::new(); - for (i, vert) in vertices.iter().enumerate(){ - map.insert(*vert , i); - } - Graph:: { - vertextoid : map, - idtovertex: vertices.to_vec(), - graph : vec![Vec::< E >::new(); vertices.len() ] - } - } + graph: Vec>, } -impl Graph{ - //This function adds a directed edge to the graph with capacity c, and the - //corresponding reversed edge with capacity 0. - pub fn add_edge(&mut self, u: Vertex, v:Vertex, c: u32) -> Result<(), String>{ - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; - let rev_u = self.graph[idu].len(); - let rev_v = self.graph[idv].len(); - self.graph[idu].push( FlowEdge{cap: c , dest: idv , flow: 0, rev : rev_v} ); - self.graph[idv].push( FlowEdge{cap: 0 , dest: idu , flow: 0, rev : rev_u} ); - Ok(()) - } - - //This function returns the list of vertices that receive a positive flow from - //vertex v. - pub fn get_positive_flow_from(&self , v:Vertex) -> Result< Vec , String>{ - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; - let mut result = Vec::::new(); - for edge in self.graph[idv].iter() { - if edge.flow > 0 { - result.push(self.idtovertex[edge.dest]); - } - } - Ok(result) - } - - - //This function returns the value of the flow incoming to v. - pub fn get_inflow(&self , v:Vertex) -> Result< i32 , String>{ - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0,self.graph[edge.dest][edge.rev].flow); - } - Ok(result) - } - - //This function returns the value of the flow outgoing from v. 
- pub fn get_outflow(&self , v:Vertex) -> Result< i32 , String>{ - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0,edge.flow); - } - Ok(result) - } - - //This function computes the flow total value by computing the outgoing flow - //from the source. - pub fn get_flow_value(&mut self) -> Result { - self.get_outflow(Vertex::Source) - } - - //This function shuffles the order of the edge lists. It keeps the ids of the - //reversed edges consistent. - fn shuffle_edges(&mut self) { - let mut rng = rand::thread_rng(); - for i in 0..self.graph.len() { - self.graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. - for j in 0..self.graph[i].len() { - let target_v = self.graph[i][j].dest; - let target_rev = self.graph[i][j].rev; - self.graph[target_v][target_rev].rev = j; - } - } - } - - //Computes an upper bound of the flow n the graph - pub fn flow_upper_bound(&self) -> u32{ - let idsource = self.vertextoid[&Vertex::Source]; - let mut flow_upper_bound = 0; - for edge in self.graph[idsource].iter(){ - flow_upper_bound += edge.cap; - } - flow_upper_bound - } - - //This function computes the maximal flow using Dinic's algorithm. It starts with - //the flow values already present in the graph. So it is possible to add some edge to - //the graph, compute a flow, add other edges, update the flow. - pub fn compute_maximal_flow(&mut self) -> Result<(), String> { - if !self.vertextoid.contains_key(&Vertex::Source) { - return Err("The graph does not contain a source.".to_string()); - } - if !self.vertextoid.contains_key(&Vertex::Sink) { - return Err("The graph does not contain a sink.".to_string()); - } - - let idsource = self.vertextoid[&Vertex::Source]; - let idsink = self.vertextoid[&Vertex::Sink]; - - let nb_vertices = self.graph.len(); - - let flow_upper_bound = self.flow_upper_bound(); - - //To ensure the dispersion of the associations generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //the vertices do not consider their neighbours in the same order. - self.shuffle_edges(); - - //We run Dinic's max flow algorithm - loop { - //We build the level array from Dinic's algorithm. - let mut level = vec![None; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((idsource, 0)); - while !fifo.is_empty() { - if let Some((id, lvl)) = fifo.pop_front() { - if level[id] == None { //it means id has not yet been reached - level[id] = Some(lvl); - for edge in self.graph[id].iter() { - if edge.cap as i32 - edge.flow > 0 { - fifo.push_back((edge.dest, lvl + 1)); - } - } - } - } - } - if level[idsink] == None { - //There is no residual flow - break; - } - //Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); - - lifo.push_back((idsource, flow_upper_bound)); - - while let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == idsink { - //The DFS reached the sink, we can add a - //residual flow. 
- lifo.pop_back(); - while let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i32; - let id_rev = self.graph[id][nbd].dest; - let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i32; - } - lifo.push_back((idsource, flow_upper_bound)); - continue; - } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= self.graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { - next_nbd[*parent] += 1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f as i32, self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow) as u32; - if new_flow == 0 { - next_nbd[id] += 1; - continue; - } - if let (Some(lvldest), Some(lvlid)) = - (level[self.graph[id][nbd].dest], level[id]){ - if lvldest <= lvlid { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - } - //otherwise, we send flow to nbd. - lifo.push_back((self.graph[id][nbd].dest, new_flow)); - } - } - Ok(()) - } - - //This function takes a flow, and a cost function on the edges, and tries to find an - // equivalent flow with a better cost, by finding improving overflow cycles. It uses - // as subroutine the Bellman Ford algorithm run up to path_length. - // We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only - // one needs to be present in the cost function. - pub fn optimize_flow_with_cost(&mut self , cost: &CostFunction, path_length: usize ) - -> Result<(),String>{ - //We build the weighted graph g where we will look for negative cycle - let mut gf = self.build_cost_graph(cost)?; - let mut cycles = gf.list_negative_cycles(path_length); - while !cycles.is_empty() { - //we enumerate negative cycles - for c in cycles.iter(){ - for i in 0..c.len(){ - //We add one flow unit to the edge (u,v) of cycle c - let idu = self.vertextoid[&c[i]]; - let idv = self.vertextoid[&c[(i+1)%c.len()]]; - for j in 0..self.graph[idu].len(){ - //since idu appears at most once in the cycles, we enumerate every - //edge at most once. - let edge = self.graph[idu][j]; - if edge.dest == idv { - self.graph[idu][j].flow += 1; - self.graph[idv][edge.rev].flow -=1; - break; - } - } - } - } - - gf = self.build_cost_graph(cost)?; - cycles = gf.list_negative_cycles(path_length); - } - Ok(()) - } - - //Construct the weighted graph G_f from the flow and the cost function - fn build_cost_graph(&self , cost: &CostFunction) -> Result,String>{ - - let mut g = Graph::::new(&self.idtovertex); - let nb_vertices = self.idtovertex.len(); - for i in 0..nb_vertices { - for edge in self.graph[i].iter() { - if edge.cap as i32 -edge.flow > 0 { - //It is possible to send overflow through this edge - let u = self.idtovertex[i]; - let v = self.idtovertex[edge.dest]; - if cost.contains_key(&(u,v)) { - g.add_edge(u,v, cost[&(u,v)])?; - } - else if cost.contains_key(&(v,u)) { - g.add_edge(u,v, -cost[&(v,u)])?; - } - else{ - g.add_edge(u,v, 0)?; - } - } - } - } - Ok(g) - - } - - +pub type CostFunction = HashMap<(Vertex, Vertex), i32>; + +impl Graph { + pub fn new(vertices: &[Vertex]) -> Self { + let mut map = HashMap::::new(); + for (i, vert) in vertices.iter().enumerate() { + map.insert(*vert, i); + } + Graph:: { + vertextoid: map, + idtovertex: vertices.to_vec(), + graph: vec![Vec::::new(); vertices.len()], + } + } } -impl Graph{ - //This function adds a single directed weighted edge to the graph. 
- pub fn add_edge(&mut self, u: Vertex, v:Vertex, w: i32) -> Result<(), String>{ - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; - self.graph[idu].push( WeightedEdge{ w , dest: idv} ); - Ok(()) - } - - //This function lists the negative cycles it manages to find after path_length - //iterations of the main loop of the Bellman-Ford algorithm. For the classical - //algorithm, path_length needs to be equal to the number of vertices. However, - //for particular graph structures like our case, the algorithm is still correct - //when path_length is the length of the longest possible simple path. - //See the formal description of the algorithm for more details. - fn list_negative_cycles(&self, path_length: usize) -> Vec< Vec > { - - let nb_vertices = self.graph.len(); - - //We start with every vertex at distance 0 of some imaginary extra -1 vertex. - let mut distance = vec![0 ; nb_vertices]; - //The prev vector collects for every vertex from where does the shortest path come - let mut prev = vec![None; nb_vertices]; - - for _ in 0..path_length +1 { - for id in 0..nb_vertices{ - for e in self.graph[id].iter(){ - if distance[id] + e.w < distance[e.dest] { - distance[e.dest] = distance[id] + e.w; - prev[e.dest] = Some(id); - } - } - } - } - - - //If self.graph contains a negative cycle, then at this point the graph described - //by prev (which is a directed 1-forest/functional graph) - //must contain a cycle. We list the cycles of prev. - let cycles_prev = cycles_of_1_forest(&prev); - - //Remark that the cycle in prev is in the reverse order compared to the cycle - //in the graph. Thus the .rev(). - return cycles_prev.iter().map(|cycle| cycle.iter().rev().map( - |id| self.idtovertex[*id] - ).collect() ).collect(); - } - +impl Graph { + //This function adds a directed edge to the graph with capacity c, and the + //corresponding reversed edge with capacity 0. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { + if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idu = self.vertextoid[&u]; + let idv = self.vertextoid[&v]; + let rev_u = self.graph[idu].len(); + let rev_v = self.graph[idv].len(); + self.graph[idu].push(FlowEdge { + cap: c, + dest: idv, + flow: 0, + rev: rev_v, + }); + self.graph[idv].push(FlowEdge { + cap: 0, + dest: idu, + flow: 0, + rev: rev_u, + }); + Ok(()) + } + + //This function returns the list of vertices that receive a positive flow from + //vertex v. + pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = Vec::::new(); + for edge in self.graph[idv].iter() { + if edge.flow > 0 { + result.push(self.idtovertex[edge.dest]); + } + } + Ok(result) + } + + //This function returns the value of the flow incoming to v. 
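//A minimal usage sketch of this flow API, assuming a caller that returns
//Result<(), String> (illustrative, not part of the patch itself):
//
// let mut g = Graph::<FlowEdge>::new(&[Vertex::Source, Vertex::Sink]);
// g.add_edge(Vertex::Source, Vertex::Sink, 4)?;
// g.compute_maximal_flow()?;
// assert_eq!(g.get_flow_value()?, 4);
// assert_eq!(g.get_positive_flow_from(Vertex::Source)?, vec![Vertex::Sink]);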
+ pub fn get_inflow(&self, v: Vertex) -> Result<i32, String> {
+ if !self.vertextoid.contains_key(&v) {
+ return Err("The graph does not contain the provided vertex.".to_string());
+ }
+ let idv = self.vertextoid[&v];
+ let mut result = 0;
+ for edge in self.graph[idv].iter() {
+ result += max(0, self.graph[edge.dest][edge.rev].flow);
+ }
+ Ok(result)
+ }
+
+ //This function returns the value of the flow outgoing from v.
+ pub fn get_outflow(&self, v: Vertex) -> Result<i32, String> {
+ if !self.vertextoid.contains_key(&v) {
+ return Err("The graph does not contain the provided vertex.".to_string());
+ }
+ let idv = self.vertextoid[&v];
+ let mut result = 0;
+ for edge in self.graph[idv].iter() {
+ result += max(0, edge.flow);
+ }
+ Ok(result)
+ }
+
+ //This function computes the flow total value by computing the outgoing flow
+ //from the source.
+ pub fn get_flow_value(&mut self) -> Result<i32, String> {
+ self.get_outflow(Vertex::Source)
+ }
+
+ //This function shuffles the order of the edge lists. It keeps the ids of the
+ //reversed edges consistent.
+ fn shuffle_edges(&mut self) {
+ let mut rng = rand::thread_rng();
+ for i in 0..self.graph.len() {
+ self.graph[i].shuffle(&mut rng);
+ //We need to update the ids of the reverse edges.
+ for j in 0..self.graph[i].len() {
+ let target_v = self.graph[i][j].dest;
+ let target_rev = self.graph[i][j].rev;
+ self.graph[target_v][target_rev].rev = j;
+ }
+ }
+ }
+
+ //Computes an upper bound of the flow in the graph
+ pub fn flow_upper_bound(&self) -> u32 {
+ let idsource = self.vertextoid[&Vertex::Source];
+ let mut flow_upper_bound = 0;
+ for edge in self.graph[idsource].iter() {
+ flow_upper_bound += edge.cap;
+ }
+ flow_upper_bound
+ }
+
+ //This function computes the maximal flow using Dinic's algorithm. It starts with
+ //the flow values already present in the graph. So it is possible to add some edges to
+ //the graph, compute a flow, add other edges, update the flow.
+ pub fn compute_maximal_flow(&mut self) -> Result<(), String> {
+ if !self.vertextoid.contains_key(&Vertex::Source) {
+ return Err("The graph does not contain a source.".to_string());
+ }
+ if !self.vertextoid.contains_key(&Vertex::Sink) {
+ return Err("The graph does not contain a sink.".to_string());
+ }
+
+ let idsource = self.vertextoid[&Vertex::Source];
+ let idsink = self.vertextoid[&Vertex::Sink];
+
+ let nb_vertices = self.graph.len();
+
+ let flow_upper_bound = self.flow_upper_bound();
+
+ //To ensure the dispersion of the associations generated by the
+ //assignation, we shuffle the neighbours of the nodes. Hence,
+ //the vertices do not consider their neighbours in the same order.
+ self.shuffle_edges();
+
+ //We run Dinic's max flow algorithm
+ loop {
+ //We build the level array from Dinic's algorithm.
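//A quick recap of the two phases that follow, for readers new to Dinic's
//algorithm: (1) a BFS from the source labels every vertex with its distance
//("level") in the residual graph; (2) a DFS then only follows residual edges
//going from level l to level l+1, so every augmenting path found is a
//shortest one, which bounds the number of BFS phases by the vertex count.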
+ let mut level = vec![None; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((idsource, 0)); + while !fifo.is_empty() { + if let Some((id, lvl)) = fifo.pop_front() { + if level[id] == None { + //it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i32 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); + } + } + } + } + } + if level[idsink] == None { + //There is no residual flow + break; + } + //Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = VecDeque::new(); + + lifo.push_back((idsource, flow_upper_bound)); + + while let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == idsink { + //The DFS reached the sink, we can add a + //residual flow. + lifo.pop_back(); + while let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i32; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i32; + } + lifo.push_back((idsource, flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= self.graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back() { + next_nbd[*parent] += 1; + } + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min( + f as i32, + self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow, + ) as u32; + if new_flow == 0 { + next_nbd[id] += 1; + continue; + } + if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { + if lvldest <= lvlid { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + } + //otherwise, we send flow to nbd. + lifo.push_back((self.graph[id][nbd].dest, new_flow)); + } + } + Ok(()) + } + + //This function takes a flow, and a cost function on the edges, and tries to find an + // equivalent flow with a better cost, by finding improving overflow cycles. It uses + // as subroutine the Bellman Ford algorithm run up to path_length. + // We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only + // one needs to be present in the cost function. + pub fn optimize_flow_with_cost( + &mut self, + cost: &CostFunction, + path_length: usize, + ) -> Result<(), String> { + //We build the weighted graph g where we will look for negative cycle + let mut gf = self.build_cost_graph(cost)?; + let mut cycles = gf.list_negative_cycles(path_length); + while !cycles.is_empty() { + //we enumerate negative cycles + for c in cycles.iter() { + for i in 0..c.len() { + //We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertextoid[&c[i]]; + let idv = self.vertextoid[&c[(i + 1) % c.len()]]; + for j in 0..self.graph[idu].len() { + //since idu appears at most once in the cycles, we enumerate every + //edge at most once. 
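//Soundness of the one-unit push performed below: routing one extra unit
//around a negative cycle preserves flow conservation at every vertex (each
//cycle vertex gains exactly one incoming and one outgoing unit), leaves the
//total flow value unchanged, and strictly decreases the cost. Since costs
//are integers and bounded below, the cycle-cancelling iteration terminates.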
+ let edge = self.graph[idu][j]; + if edge.dest == idv { + self.graph[idu][j].flow += 1; + self.graph[idv][edge.rev].flow -= 1; + break; + } + } + } + } + + gf = self.build_cost_graph(cost)?; + cycles = gf.list_negative_cycles(path_length); + } + Ok(()) + } + + //Construct the weighted graph G_f from the flow and the cost function + fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { + let mut g = Graph::::new(&self.idtovertex); + let nb_vertices = self.idtovertex.len(); + for i in 0..nb_vertices { + for edge in self.graph[i].iter() { + if edge.cap as i32 - edge.flow > 0 { + //It is possible to send overflow through this edge + let u = self.idtovertex[i]; + let v = self.idtovertex[edge.dest]; + if cost.contains_key(&(u, v)) { + g.add_edge(u, v, cost[&(u, v)])?; + } else if cost.contains_key(&(v, u)) { + g.add_edge(u, v, -cost[&(v, u)])?; + } else { + g.add_edge(u, v, 0)?; + } + } + } + } + Ok(g) + } } +impl Graph { + //This function adds a single directed weighted edge to the graph. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { + if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idu = self.vertextoid[&u]; + let idv = self.vertextoid[&v]; + self.graph[idu].push(WeightedEdge { w, dest: idv }); + Ok(()) + } + + //This function lists the negative cycles it manages to find after path_length + //iterations of the main loop of the Bellman-Ford algorithm. For the classical + //algorithm, path_length needs to be equal to the number of vertices. However, + //for particular graph structures like our case, the algorithm is still correct + //when path_length is the length of the longest possible simple path. + //See the formal description of the algorithm for more details. + fn list_negative_cycles(&self, path_length: usize) -> Vec> { + let nb_vertices = self.graph.len(); + + //We start with every vertex at distance 0 of some imaginary extra -1 vertex. + let mut distance = vec![0; nb_vertices]; + //The prev vector collects for every vertex from where does the shortest path come + let mut prev = vec![None; nb_vertices]; + + for _ in 0..path_length + 1 { + for id in 0..nb_vertices { + for e in self.graph[id].iter() { + if distance[id] + e.w < distance[e.dest] { + distance[e.dest] = distance[id] + e.w; + prev[e.dest] = Some(id); + } + } + } + } + + //If self.graph contains a negative cycle, then at this point the graph described + //by prev (which is a directed 1-forest/functional graph) + //must contain a cycle. We list the cycles of prev. + let cycles_prev = cycles_of_1_forest(&prev); + + //Remark that the cycle in prev is in the reverse order compared to the cycle + //in the graph. Thus the .rev(). + return cycles_prev + .iter() + .map(|cycle| cycle.iter().rev().map(|id| self.idtovertex[*id]).collect()) + .collect(); + } +} //This function returns the list of cycles of a directed 1 forest. It does not //check for the consistency of the input. -fn cycles_of_1_forest(forest: &[Option]) -> Vec> { - let mut cycles = Vec::>::new(); - let mut time_of_discovery = vec![None; forest.len()]; - - for t in 0..forest.len(){ - let mut id = t; - //while we are on a valid undiscovered node - while time_of_discovery[id] == None { - time_of_discovery[id] = Some(t); - if let Some(i) = forest[id] { - id = i; - } - else{ - break; - } - } - if forest[id] != None && time_of_discovery[id] == Some(t) { - //We discovered an id that we explored at this iteration t. 
- //It means we are on a cycle - let mut cy = vec![id; 1]; - let mut id2 = id; - while let Some(id_next) = forest[id2] { - id2 = id_next; - if id2 != id { - cy.push(id2); - } - else { - break; - } - } - cycles.push(cy); - } - } - cycles +fn cycles_of_1_forest(forest: &[Option]) -> Vec> { + let mut cycles = Vec::>::new(); + let mut time_of_discovery = vec![None; forest.len()]; + + for t in 0..forest.len() { + let mut id = t; + //while we are on a valid undiscovered node + while time_of_discovery[id] == None { + time_of_discovery[id] = Some(t); + if let Some(i) = forest[id] { + id = i; + } else { + break; + } + } + if forest[id] != None && time_of_discovery[id] == Some(t) { + //We discovered an id that we explored at this iteration t. + //It means we are on a cycle + let mut cy = vec![id; 1]; + let mut id2 = id; + while let Some(id_next) = forest[id2] { + id2 = id_next; + if id2 != id { + cy.push(id2); + } else { + break; + } + } + cycles.push(cy); + } + } + cycles } - - diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 976f94af..3a6f42ee 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; -use garage_util::crdt::{AutoCrdt, Crdt, LwwMap, Lww}; +use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::error::*; @@ -27,11 +27,11 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, - - //This attribute is only used to retain the previously computed partition size, - //to know to what extent does it change with the layout update. - pub partition_size: u32, - pub parameters: LayoutParameters, + + //This attribute is only used to retain the previously computed partition size, + //to know to what extent does it change with the layout update. + pub partition_size: u32, + pub parameters: LayoutParameters, pub roles: LwwMap, @@ -39,7 +39,7 @@ pub struct ClusterLayout { /// in the system (this includes gateway nodes). /// The order here is different than the vec stored by `roles`, because: /// 1. non-gateway nodes are first so that they have lower numbers holding - /// in u8 (the number of non-gateway nodes is at most 256). + /// in u8 (the number of non-gateway nodes is at most 256). /// 2. nodes that don't have a role are excluded (but they need to /// stay in the CRDT as tombstones) pub node_id_vec: Vec, @@ -49,7 +49,7 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout - pub staged_parameters: Lww, + pub staged_parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } @@ -58,14 +58,14 @@ pub struct ClusterLayout { ///algorithm. It is stored as a Crdt. 
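//Hypothetical update sequence for these staged parameters (the names are the
//real ones, the scenario is a sketch and assumes node roles were already
//assigned so that the assignation computation can succeed):
//
// let mut layout = ClusterLayout::new(3);
// layout.staged_parameters.update(LayoutParameters { zone_redundancy: 2 });
// let next_version = layout.version + 1;
// let (layout, msg) = layout.apply_staged_changes(Some(next_version))?;
// //layout.parameters.zone_redundancy is now 2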
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct LayoutParameters { - pub zone_redundancy:usize, + pub zone_redundancy: usize, } impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -96,27 +96,28 @@ impl NodeRole { } } - pub fn tags_string(&self) -> String { - let mut tags = String::new(); - if self.tags.is_empty() { - return tags - } - tags.push_str(&self.tags[0].clone()); - for t in 1..self.tags.len(){ - tags.push(','); - tags.push_str(&self.tags[t].clone()); - } - tags - } + pub fn tags_string(&self) -> String { + let mut tags = String::new(); + if self.tags.is_empty() { + return tags; + } + tags.push_str(&self.tags[0].clone()); + for t in 1..self.tags.len() { + tags.push(','); + tags.push_str(&self.tags[t].clone()); + } + tags + } } impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - - //We set the default zone redundancy to be equal to the replication factor, - //i.e. as strict as possible. - let parameters = LayoutParameters{ zone_redundancy: replication_factor}; - let staged_parameters = Lww::::new(parameters.clone()); + //We set the default zone redundancy to be equal to the replication factor, + //i.e. as strict as possible. + let parameters = LayoutParameters { + zone_redundancy: replication_factor, + }; + let staged_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); @@ -124,12 +125,12 @@ impl ClusterLayout { ClusterLayout { version: 0, replication_factor, - partition_size: 0, + partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), - parameters, - staged_parameters, + parameters, + staged_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -142,11 +143,10 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); - self.staged_parameters.merge(&other.staged_parameters); + let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); + self.staged_parameters.merge(&other.staged_parameters); self.staging.merge(&other.staging); - let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); let stage_changed = new_staging_hash != self.staging_hash; @@ -158,7 +158,7 @@ impl ClusterLayout { } } - pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self,Message), Error> { + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { match version { None => { let error = r#" @@ -177,14 +177,14 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging); self.roles.retain(|(_, _, v)| v.0.is_some()); - let msg = self.calculate_partition_assignation()?; + let msg = self.calculate_partition_assignation()?; self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); self.version += 1; - Ok((self,msg)) + Ok((self, msg)) } pub fn revert_staged_changes(mut self, version: Option) -> Result { @@ -229,44 +229,52 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the uuids of the non_gateway nodes in 
self.node_id_vec. - pub fn useful_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity != None => result.push(*uuid), - _ => () - } - } - result - } - - ///Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid : &Uuid) -> Result { - match self.node_role(uuid) { - Some(role) => Ok(role.zone.clone()), - _ => Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) - } - } - - ///Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { - match self.node_role(uuid) { - Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => Ok(*cap), - _ => Err(Error::Message("The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity.".into())) - } - } - - ///Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { - let mut total_capacity = 0; - for uuid in self.useful_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; - } - Ok(total_capacity) - } + ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub fn useful_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity != None => result.push(*uuid), + _ => (), + } + } + result + } + + ///Given a node uuids, this function returns the label of its zone + pub fn get_node_zone(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(role) => Ok(role.zone.clone()), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + + ///Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole { + capacity: Some(cap), + zone: _, + tags: _, + }) => Ok(*cap), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )), + } + } + ///Returns the sum of capacities of non gateway nodes in the cluster + pub fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.useful_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + Ok(total_capacity) + } /// Check a cluster layout for internal consistency /// returns true if consistent, false if error @@ -311,580 +319,689 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that every partition is associated to distinct nodes - let rf = self.replication_factor; - for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignation_data[rf*p..rf*(p+1)].to_vec(); - if nodes_of_p.iter().unique().count() != rf { - return false; - } - //Check that every partition is spread over at least zone_redundancy zones. 
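//Worked example of the redundancy check: with replication_factor = 3 and
//zone_redundancy = 2, a partition stored on nodes lying in zones [A, A, B]
//passes (2 distinct zones), while [A, A, A] fails. This is exactly what
//zones_of_p.unique().count() computes via itertools.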
- let zones_of_p = nodes_of_p.iter()
- .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize])
- .expect("Zone not found."));
- let redundancy = self.parameters.zone_redundancy;
- if zones_of_p.unique().count() < redundancy {
- return false;
- }
- }
-
- //Check that the nodes capacities is consistent with the stored partitions
- let mut node_usage = vec![0; MAX_NODE_NUMBER];
- for n in self.ring_assignation_data.iter() {
- node_usage[*n as usize] += 1;
- }
- for (n, usage) in node_usage.iter().enumerate(){
- if *usage > 0 {
- let uuid = self.node_id_vec[n];
- if usage*self.partition_size > self.get_node_capacity(&uuid)
- .expect("Critical Error"){
- return false;
- }
- }
- }
-
- //Check that the partition size stored is the one computed by the asignation
- //algorithm.
- let cl2 = self.clone();
- let (_ , zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error");
- let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error");
- if partition_size != self.partition_size {
- return false;
- }
+ //Check that every partition is associated to distinct nodes
+ let rf = self.replication_factor;
+ for p in 0..(1 << PARTITION_BITS) {
+ let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec();
+ if nodes_of_p.iter().unique().count() != rf {
+ return false;
+ }
+ //Check that every partition is spread over at least zone_redundancy zones.
+ let zones_of_p = nodes_of_p.iter().map(|n| {
+ self.get_node_zone(&self.node_id_vec[*n as usize])
+ .expect("Zone not found.")
+ });
+ let redundancy = self.parameters.zone_redundancy;
+ if zones_of_p.unique().count() < redundancy {
+ return false;
+ }
+ }
+
+ //Check that the node capacities are consistent with the stored partitions
+ let mut node_usage = vec![0; MAX_NODE_NUMBER];
+ for n in self.ring_assignation_data.iter() {
+ node_usage[*n as usize] += 1;
+ }
+ for (n, usage) in node_usage.iter().enumerate() {
+ if *usage > 0 {
+ let uuid = self.node_id_vec[n];
+ if usage * self.partition_size
+ > self.get_node_capacity(&uuid).expect("Critical Error")
+ {
+ return false;
+ }
+ }
+ }
+
+ //Check that the partition size stored is the one computed by the assignation
+ //algorithm.
+ let cl2 = self.clone();
+ let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error");
+ let partition_size = cl2
+ .compute_optimal_partition_size(&zone_to_id)
+ .expect("Critical Error");
+ if partition_size != self.partition_size {
+ return false;
+ }
 true
 }
-}

 impl ClusterLayout {
 /// This function calculates a new partition-to-node assignation.
 /// The computed assignation respects the node replication factor
- /// and the zone redundancy parameter It maximizes the capacity of a
+ /// and the zone redundancy parameter. It maximizes the capacity of a
 /// partition (assuming all partitions have the same size).
 /// Among such optimal assignation, it minimizes the distance to
 /// the former assignation (if any) to minimize the amount of
 /// data to be moved.
- /// Staged changes must be merged with nodes roles before calling this function.
- pub fn calculate_partition_assignation(&mut self) -> Result<Message, Error> {
+ /// Staged changes must be merged with node roles before calling this function.
+ pub fn calculate_partition_assignation(&mut self) -> Result<Message, Error> {
 //The nodes might have been updated, some might have been deleted.
 //So we need to first update the list of nodes and retrieve the
 //assignation.
-
- //We update the node ids, since the node role list might have changed with the
- //changes in the layout.
We retrieve the old_assignation reframed with the new ids - let old_assignation_opt = self.update_node_id_vec()?; - - let redundancy = self.staged_parameters.get().zone_redundancy; - - - let mut msg = Message::new(); - msg.push(format!("Computation of a new cluster layout where partitions are \ - replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); - - //We generate for once numerical ids for the zones of non gateway nodes, - //to use them as indices in the flow graphs. - let (id_to_zone , zone_to_id) = self.generate_useful_zone_ids()?; - - let nb_useful_nodes = self.useful_nodes().len(); - msg.push(format!("The cluster contains {} nodes spread over {} zones.", - nb_useful_nodes, id_to_zone.len())); - if nb_useful_nodes < self.replication_factor{ - return Err(Error::Message(format!("The number of nodes with positive \ + + //We update the node ids, since the node role list might have changed with the + //changes in the layout. We retrieve the old_assignation reframed with the new ids + let old_assignation_opt = self.update_node_id_vec()?; + + let redundancy = self.staged_parameters.get().zone_redundancy; + + let mut msg = Message::new(); + msg.push(format!( + "Computation of a new cluster layout where partitions are \ + replicated {} times on at least {} distinct zones.", + self.replication_factor, redundancy + )); + + //We generate for once numerical ids for the zones of non gateway nodes, + //to use them as indices in the flow graphs. + let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; + + let nb_useful_nodes = self.useful_nodes().len(); + msg.push(format!( + "The cluster contains {} nodes spread over {} zones.", + nb_useful_nodes, + id_to_zone.len() + )); + if nb_useful_nodes < self.replication_factor { + return Err(Error::Message(format!( + "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_useful_nodes, self.replication_factor))); - } - if id_to_zone.len() < redundancy { - return Err(Error::Message(format!("The number of zones with non-gateway \ + nb_useful_nodes, self.replication_factor + ))); + } + if id_to_zone.len() < redundancy { + return Err(Error::Message(format!( + "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", - id_to_zone.len() , redundancy))); - } - - //We compute the optimal partition size - //Capacities should be given in a unit so that partition size is at least 100. - //In this case, integer rounding plays a marginal role in the percentages of - //optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; - - if old_assignation_opt != None { - msg.push(format!("Given the replication and redundancy constraint, the \ + id_to_zone.len(), + redundancy + ))); + } + + //We compute the optimal partition size + //Capacities should be given in a unit so that partition size is at least 100. + //In this case, integer rounding plays a marginal role in the percentages of + //optimality. + let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + + if old_assignation_opt != None { + msg.push(format!( + "Given the replication and redundancy constraint, the \ optimal size of a partition is {}. 
In the previous layout, it used to \ - be {} (the zone redundancy was {}).", partition_size, self.partition_size, - self.parameters.zone_redundancy)); - } - else { - msg.push(format!("Given the replication and redundancy constraints, the \ - optimal size of a partition is {}.", partition_size)); - } - self.partition_size = partition_size; - self.parameters = self.staged_parameters.get().clone(); - - if partition_size < 100 { - msg.push("WARNING: The partition size is low (< 100), you might consider to \ - provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb).".into()); - } - - //We compute a first flow/assignment that is heuristically close to the previous - //assignment - let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; - if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignment. - self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; - } - - msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); - msg.push("".to_string()); - - //We update the layout structure - self.update_ring_from_flow(id_to_zone.len() , &gflow)?; - Ok(msg) - } + be {} (the zone redundancy was {}).", + partition_size, self.partition_size, self.parameters.zone_redundancy + )); + } else { + msg.push(format!( + "Given the replication and redundancy constraints, the \ + optimal size of a partition is {}.", + partition_size + )); + } + self.partition_size = partition_size; + self.parameters = self.staged_parameters.get().clone(); + + if partition_size < 100 { + msg.push( + "WARNING: The partition size is low (< 100), you might consider to \ + provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb)." + .into(), + ); + } + + //We compute a first flow/assignment that is heuristically close to the previous + //assignment + let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignation_opt)?; + if let Some(assoc) = &old_assignation_opt { + //We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; + } + + msg.append(&mut self.output_stat( + &gflow, + &old_assignation_opt, + &zone_to_id, + &id_to_zone, + )?); + msg.push("".to_string()); + + //We update the layout structure + self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + Ok(msg) + } /// The LwwMap of node roles might have changed. This function updates the node_id_vec /// and returns the assignation given by ring, with the new indices of the nodes, and /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,Error> { - // (1) We compute the new node list - //Non gateway nodes should be coded on 8bits, hence they must be first in the list - //We build the new node ids - let mut new_non_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) - .map(|(k, _, _)| *k).collect(); - - if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ - layout. 
This is not allowed.", MAX_NODE_NUMBER) )); - } - - let mut new_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) - .map(|(k, _, _)| *k).collect(); - - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.append(&mut new_non_gateway_nodes); - new_node_id_vec.append(&mut new_gateway_nodes); - - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); - - // (2) We retrieve the old association - //We rewrite the old association with the new indices. We only consider partition - //to node assignations where the node is still in use. - let nb_partitions = 1usize << PARTITION_BITS; - let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; - - if self.ring_assignation_data.is_empty() { - //This is a new association - return Ok(None); - } - if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err(Error::Message("The old assignation does not have a size corresponding to \ - the old replication factor or the number of partitions.".into())); - } - - //We build a translation table between the uuid and new ids - let mut uuid_to_new_id = HashMap::::new(); - - //We add the indices of only the new non-gateway nodes that can be used in the - //association ring - for (i, uuid) in new_node_id_vec.iter().enumerate() { - uuid_to_new_id.insert(*uuid, i ); - } - - let rf= self.replication_factor; - for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { - for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { - let uuid = old_node_id_vec[*old_id as usize]; - if uuid_to_new_id.contains_key(&uuid) { - old_assign_p.push(uuid_to_new_id[&uuid]); - } - } - } - - //We write the ring - self.ring_assignation_data = Vec::::new(); - - if !self.check() { - return Err(Error::Message("Critical error: The computed layout happens to be incorrect".into())); - } - - Ok(Some(old_assignation)) + fn update_node_id_vec(&mut self) -> Result>>, Error> { + // (1) We compute the new node list + //Non gateway nodes should be coded on 8bits, hence they must be first in the list + //We build the new node ids + let mut new_non_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) + .map(|(k, _, _)| *k) + .collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(Error::Message(format!( + "There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", + MAX_NODE_NUMBER + ))); + } + + let mut new_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) + .map(|(k, _, _)| *k) + .collect(); + + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.append(&mut new_non_gateway_nodes); + new_node_id_vec.append(&mut new_gateway_nodes); + + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); + + // (2) We retrieve the old association + //We rewrite the old association with the new indices. We only consider partition + //to node assignations where the node is still in use. 
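//Worked example of the renumbering below (hypothetical uuids a, b, c, d):
//if old node_id_vec = [a, b, c] and new node_id_vec = [a, c, d] (b removed),
//then uuid_to_new_id = {a: 0, c: 1, d: 2} and a partition formerly assigned
//to [a, b, c] is retrieved as [0, 1]: the entry for b is dropped, which is
//why a retrieved assignation can be shorter than the replication factor.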
+ let nb_partitions = 1usize << PARTITION_BITS; + let mut old_assignation = vec![Vec::::new(); nb_partitions]; + + if self.ring_assignation_data.is_empty() { + //This is a new association + return Ok(None); + } + if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + return Err(Error::Message( + "The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions." + .into(), + )); + } + + //We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + //We add the indices of only the new non-gateway nodes that can be used in the + //association ring + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i); + } + + let rf = self.replication_factor; + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { + for old_id in &self.ring_assignation_data[p * rf..(p + 1) * rf] { + let uuid = old_node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assign_p.push(uuid_to_new_id[&uuid]); + } + } + } + + //We write the ring + self.ring_assignation_data = Vec::::new(); + + if !self.check() { + return Err(Error::Message( + "Critical error: The computed layout happens to be incorrect".into(), + )); + } + + Ok(Some(old_assignation)) } + ///This function generates ids for the zone of the nodes appearing in + ///self.node_id_vec. + fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.useful_nodes().iter() { + if self.roles.get(uuid) == None { + return Err(Error::Message( + "The uuid was not found in the node roles (this should \ + not happen, it might be a critical error)." + .into(), + )); + } + if let Some(r) = self.node_role(uuid) { + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + } + } + Ok((id_to_zone, zone_to_id)) + } - ///This function generates ids for the zone of the nodes appearing in - ///self.node_id_vec. - fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ - let mut id_to_zone = Vec::::new(); - let mut zone_to_id = HashMap::::new(); - - for uuid in self.useful_nodes().iter() { - if self.roles.get(uuid) == None { - return Err(Error::Message("The uuid was not found in the node roles (this should \ - not happen, it might be a critical error).".into())); - } - if let Some(r) = self.node_role(uuid) { - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone() , id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } - } - } - Ok((id_to_zone, zone_to_id)) - } - - ///This function computes by dichotomy the largest realizable partition size, given - ///the layout. - fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ - let nb_partitions = 1usize << PARTITION_BITS; - let empty_set = HashSet::<(usize,usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err(Error::Message("The storage capacity of he cluster is to small. 
It is \ - impossible to store partitions of size 1.".into())); - } - - let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; - while s_down +1 < s_up { - g = self.generate_flow_graph((s_down+s_up)/2, zone_to_id, &empty_set)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - s_up = (s_down+s_up)/2; - } - else { - s_down = (s_down+s_up)/2; - } - } - - Ok(s_down) - } - - fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { - let mut vertices = vec![Vertex::Source, Vertex::Sink]; - for p in 0..NB_PARTITIONS { - vertices.push(Vertex::Pup(p)); - vertices.push(Vertex::Pdown(p)); - for z in 0..nb_zones { - vertices.push(Vertex::PZ(p, z)); - } - } - for n in 0..nb_nodes { - vertices.push(Vertex::N(n)); - } - vertices - } - - fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { - let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), - self.useful_nodes().len()); - let mut g= Graph::::new(&vertices); - let nb_zones = zone_to_id.len(); - let redundancy = self.staged_parameters.get().zone_redundancy; - for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; - g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; - for z in 0..nb_zones { - g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; - g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , - self.replication_factor as u32)?; - } - } - for n in 0..self.useful_nodes().len() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity/size)?; - for p in 0..NB_PARTITIONS { - if !exclude_assoc.contains(&(p,n)) { - g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; - } - } - } - Ok(g) - } - - - fn compute_candidate_assignment(&self, zone_to_id: &HashMap, - old_assoc_opt : &Option >>) -> Result, Error > { - - //We list the edges that are not used in the old association - let mut exclude_edge = HashSet::<(usize,usize)>::new(); - if let Some(old_assoc) = old_assoc_opt { - let nb_nodes = self.useful_nodes().len(); - for (p, old_assoc_p) in old_assoc.iter().enumerate() { - for n in 0..nb_nodes { - exclude_edge.insert((p,n)); - } - for n in old_assoc_p.iter() { - exclude_edge.remove(&(p,*n)); - } - } - } - - //We compute the best flow using only the edges used in the old assoc - let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge )?; - g.compute_maximal_flow()?; - for (p,n) in exclude_edge.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; - g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; - } - g.compute_maximal_flow()?; - Ok(g) - } - - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &[Vec ]) -> Result<(), Error > { - let mut cost = CostFunction::new(); - for (p, assoc_p) in old_assoc.iter().enumerate(){ - for n in assoc_p.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; - cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); - } - } - let nb_nodes = self.useful_nodes().len(); - let path_length = 4*nb_nodes; - gflow.optimize_flow_with_cost(&cost, path_length)?; - - Ok(()) - } - - fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ - self.ring_assignation_data = 
Vec::::new(); - for p in 0..NB_PARTITIONS { - for z in 0..nb_zones { - let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - for vertex in assoc_vertex.iter() { - if let Vertex::N(n) = vertex { - self.ring_assignation_data.push((*n).try_into().unwrap()); - } - } - } - } - - if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err(Error::Message("Critical Error : the association ring we produced does not \ - have the right size.".into())); - } - Ok(()) - } - - - //This function returns a message summing up the partition repartition of the new - //layout. - fn output_stat(&self , gflow : &Graph, - old_assoc_opt : &Option< Vec> >, - zone_to_id: &HashMap, - id_to_zone : &[String]) -> Result{ - let mut msg = Message::new(); - + ///This function computes by dichotomy the largest realizable partition size, given + ///the layout. + fn compute_optimal_partition_size( + &self, + zone_to_id: &HashMap, + ) -> Result { let nb_partitions = 1usize << PARTITION_BITS; - let used_cap = self.partition_size * nb_partitions as u32 * - self.replication_factor as u32; - let total_cap = self.get_total_capacity()?; - let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); - msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", - used_cap , total_cap , percent_cap )); - msg.push("".into()); - msg.push("If the percentage is to low, it might be that the \ + let empty_set = HashSet::<(usize, usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? + < (nb_partitions * self.replication_factor) + .try_into() + .unwrap() + { + return Err(Error::Message( + "The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1." + .into(), + )); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down + 1 < s_up { + g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? 
+ < (nb_partitions * self.replication_factor) + .try_into() + .unwrap() + { + s_up = (s_down + s_up) / 2; + } else { + s_down = (s_down + s_up) / 2; + } + } + + Ok(s_down) + } + + fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + vertices + } + + fn generate_flow_graph( + &self, + size: u32, + zone_to_id: &HashMap, + exclude_assoc: &HashSet<(usize, usize)>, + ) -> Result, Error> { + let vertices = + ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); + let mut g = Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + let redundancy = self.staged_parameters.get().zone_redundancy; + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge( + Vertex::Source, + Vertex::Pdown(p), + (self.replication_factor - redundancy) as u32, + )?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; + g.add_edge( + Vertex::Pdown(p), + Vertex::PZ(p, z), + self.replication_factor as u32, + )?; + } + } + for n in 0..self.useful_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p, n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + Ok(g) + } + + fn compute_candidate_assignment( + &self, + zone_to_id: &HashMap, + old_assoc_opt: &Option>>, + ) -> Result, Error> { + //We list the edges that are not used in the old association + let mut exclude_edge = HashSet::<(usize, usize)>::new(); + if let Some(old_assoc) = old_assoc_opt { + let nb_nodes = self.useful_nodes().len(); + for (p, old_assoc_p) in old_assoc.iter().enumerate() { + for n in 0..nb_nodes { + exclude_edge.insert((p, n)); + } + for n in old_assoc_p.iter() { + exclude_edge.remove(&(p, *n)); + } + } + } + + //We compute the best flow using only the edges used in the old assoc + let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + g.compute_maximal_flow()?; + for (p, n) in exclude_edge.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + Ok(g) + } + + fn minimize_rebalance_load( + &self, + gflow: &mut Graph, + zone_to_id: &HashMap, + old_assoc: &[Vec], + ) -> Result<(), Error> { + let mut cost = CostFunction::new(); + for (p, assoc_p) in old_assoc.iter().enumerate() { + for n in assoc_p.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); + } + } + let nb_nodes = self.useful_nodes().len(); + let path_length = 4 * nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + Ok(()) + } + + fn update_ring_from_flow( + &mut self, + nb_zones: usize, + gflow: &Graph, + ) -> Result<(), Error> { + self.ring_assignation_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + for vertex in assoc_vertex.iter() { + if let Vertex::N(n) = vertex { + 
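//Reminder on the ring encoding used here: the assignation is stored
//flattened, with replication_factor consecutive u8 node indices per
//partition, so partition p occupies the slice
//ring_assignation_data[p * replication_factor .. (p + 1) * replication_factor].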
self.ring_assignation_data.push((*n).try_into().unwrap()); + } + } + } + } + + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "Critical Error : the association ring we produced does not \ + have the right size." + .into(), + )); + } + Ok(()) + } + + //This function returns a message summing up the partition repartition of the new + //layout. + fn output_stat( + &self, + gflow: &Graph, + old_assoc_opt: &Option>>, + zone_to_id: &HashMap, + id_to_zone: &[String], + ) -> Result { + let mut msg = Message::new(); + + let nb_partitions = 1usize << PARTITION_BITS; + let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push(format!( + "Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + used_cap, total_cap, percent_cap + )); + msg.push("".into()); + msg.push( + "If the percentage is to low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ - See the detailed statistics below and look for saturated nodes/zones.".into()); - msg.push(format!("Recall that because of the replication factor, the actual available \ - storage capacity is {} / {} = {}.", - used_cap , self.replication_factor , - used_cap/self.replication_factor as u32)); - - //We define and fill in the following tables - let storing_nodes = self.useful_nodes(); - let mut new_partitions = vec![0; storing_nodes.len()]; - let mut stored_partitions = vec![0; storing_nodes.len()]; - - let mut new_partitions_zone = vec![0; id_to_zone.len()]; - let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - - for p in 0..nb_partitions { - for z in 0..id_to_zone.len() { - let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - if !pz_nodes.is_empty() { - stored_partitions_zone[z] += 1; - if let Some(old_assoc) = old_assoc_opt { - let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { - old_zones_of_p.push( - zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } - } - for vert in pz_nodes.iter() { - if let Vertex::N(n) = *vert { - stored_partitions[n] += 1; - if let Some(old_assoc) = old_assoc_opt { - if !old_assoc[p].contains(&n) { - new_partitions[n] += 1; - } - } - } - } - } - } - - if *old_assoc_opt == None { - new_partitions = stored_partitions.clone(); - new_partitions_zone = stored_partitions_zone.clone(); - } - - //We display the statistics - - msg.push("".into()); - if *old_assoc_opt != None { - let total_new_partitions : usize = new_partitions.iter().sum(); - msg.push(format!("A total of {} new copies of partitions need to be \ - transferred.", total_new_partitions)); - } - msg.push("".into()); - msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); - - for z in 0..id_to_zone.len(){ - let mut nodes_of_z = Vec::::new(); - for n in 0..storing_nodes.len(){ - if self.get_node_zone(&self.node_id_vec[n])? 
== id_to_zone[z] { - nodes_of_z.push(n); - } - } - let replicated_partitions : usize = nodes_of_z.iter() - .map(|n| stored_partitions[*n]).sum(); - msg.push("".into()); - - msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ - {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], - new_partitions_zone[z], replicated_partitions)); - - let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; - let mut total_cap_z = 0; - for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; - } - let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); - msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", - available_cap_z, total_cap_z, percent_cap_z)); - - for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; - let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self.node_role(&self.node_id_vec[*n]) - .ok_or("Node not found."))?.tags_string(); - msg.push(format!(" Node {}: {} partitions ({} new) ; \ - available/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec()[0..2].to_vec().encode_hex::(), - stored_partitions[*n], - new_partitions[*n], available_cap_n, total_cap_n, - (available_cap_n as f32)/(total_cap_n as f32)*100.0 , - tags_n)); - } - } - - Ok(msg) - } - + See the detailed statistics below and look for saturated nodes/zones." + .into(), + ); + msg.push(format!( + "Recall that because of the replication factor, the actual available \ + storage capacity is {} / {} = {}.", + used_cap, + self.replication_factor, + used_cap / self.replication_factor as u32 + )); + + //We define and fill in the following tables + let storing_nodes = self.useful_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..nb_partitions { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + if !pz_nodes.is_empty() { + stored_partitions_zone[z] += 1; + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p + .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(old_assoc) = old_assoc_opt { + if !old_assoc[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + } + } + + if *old_assoc_opt == None { + new_partitions = stored_partitions.clone(); + new_partitions_zone = stored_partitions_zone.clone(); + } + + //We display the statistics + + msg.push("".into()); + if *old_assoc_opt != None { + let total_new_partitions: usize = new_partitions.iter().sum(); + msg.push(format!( + "A total of {} new copies of partitions need to be \ + transferred.", + total_new_partitions + )); + } + msg.push("".into()); + msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); + + for z in 0..id_to_zone.len() { + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len() { + if self.get_node_zone(&self.node_id_vec[n])? 
== id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions: usize = + nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); + msg.push("".into()); + + msg.push(format!( + "Zone {}: {} distinct partitions stored ({} new, \ + {} partition copies) ", + id_to_zone[z], + stored_partitions_zone[z], + new_partitions_zone[z], + replicated_partitions + )); + + let available_cap_z: u32 = self.partition_size * replicated_partitions as u32; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); + msg.push(format!( + " Available capacity / Total capacity: {}/{} ({:.1}%).", + available_cap_z, total_cap_z, percent_cap_z + )); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u32 * self.partition_size; + let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self + .node_role(&self.node_id_vec[*n]) + .ok_or("Node not found."))? + .tags_string(); + msg.push(format!( + " Node {}: {} partitions ({} new) ; \ + available/total capacity: {} / {} ({:.1}%) ; tags:{}", + &self.node_id_vec[*n].to_vec()[0..2] + .to_vec() + .encode_hex::(), + stored_partitions[*n], + new_partitions[*n], + available_cap_n, + total_cap_n, + (available_cap_n as f32) / (total_cap_n as f32) * 100.0, + tags_n + )); + } + } + + Ok(msg) + } } //==================================================================================== #[cfg(test)] mod tests { - use super::{*,Error}; - use std::cmp::min; - - - //This function checks that the partition size S computed is at least better than the - //one given by a very naive algorithm. To do so, we try to run the naive algorithm - //assuming a partion size of S+1. If we succed, it means that the optimal assignation - //was not optimal. The naive algorithm is the following : - //- we compute the max number of partitions associated to every node, capped at the - //partition number. It gives the number of tokens of every node. - //- every zone has a number of tokens equal to the sum of the tokens of its nodes. - //- we cycle over the partitions and associate zone tokens while respecting the - //zone redundancy constraint. - //NOTE: the naive algorithm is not optimal. 
Counter example:
-	//take nb_partition = 3 ; replication_factor = 5; redundancy = 4;
-	//number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2)
-	//With these parameters, the naive algo fails, whereas there is a solution:
-	//(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E)
-	fn check_against_naive(cl: &ClusterLayout) -> Result<bool, Error> {
-		let over_size = cl.partition_size +1;
-		let mut zone_token = HashMap::<String, usize>::new();
-		let nb_partitions = 1usize << PARTITION_BITS;
-
-		let (zones, zone_to_id) = cl.generate_useful_zone_ids()?;
-
-		if zones.is_empty() {
-			return Ok(false);
-		}
-
-		for z in zones.iter() {
-			zone_token.insert(z.clone(), 0);
-		}
-		for uuid in cl.useful_nodes().iter() {
-			let z = cl.get_node_zone(uuid)?;
-			let c = cl.get_node_capacity(uuid)?;
-			zone_token.insert(z.clone(), zone_token[&z] + min(nb_partitions , (c/over_size) as usize));
-		}
-
-		//For every partition, we count the number of zones already associated and
-		//the name of the last zone associated
-
-		let mut id_zone_token = vec![0; zones.len()];
-		for (z,t) in zone_token.iter() {
-			id_zone_token[zone_to_id[z]] = *t;
-		}
-
-		let mut nb_token = vec![0; nb_partitions];
-		let mut last_zone = vec![zones.len(); nb_partitions];
-
-		let mut curr_zone = 0;
-
-		let redundancy = cl.parameters.zone_redundancy;
-
-		for replic in 0..cl.replication_factor {
-			for p in 0..nb_partitions {
-				while id_zone_token[curr_zone] == 0 ||
-					(last_zone[p] == curr_zone
-					&& redundancy - nb_token[p] <= cl.replication_factor - replic) {
-					curr_zone += 1;
-					if curr_zone >= zones.len() {
-						return Ok(true);
-					}
-				}
-				id_zone_token[curr_zone] -= 1;
-				if last_zone[p] != curr_zone {
-					nb_token[p] += 1;
-					last_zone[p] = curr_zone;
-				}
-			}
-		}
-
-		return Ok(false);
-	}
-
-	fn show_msg(msg : &Message) {
-		for s in msg.iter(){
-			println!("{}",s);
-		}
-	}
+	use super::{Error, *};
+	use std::cmp::min;
+
+	//This function checks that the partition size S computed is at least as good as the
+	//one given by a very naive algorithm. To do so, we try to run the naive algorithm
+	//assuming a partition size of S+1. If we succeed, it means that the computed
+	//assignation was not optimal. The naive algorithm is the following:
+	//- we compute the max number of partitions associated to every node, capped at the
+	//partition number. It gives the number of tokens of every node.
+	//- every zone has a number of tokens equal to the sum of the tokens of its nodes.
+	//- we cycle over the partitions and associate zone tokens while respecting the
+	//zone redundancy constraint.
+	//NOTE: the naive algorithm is not optimal.
Counter example: + //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + //With these parameters, the naive algo fails, whereas there is a solution: + //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &ClusterLayout) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + let nb_partitions = 1usize << PARTITION_BITS; + + let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.useful_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert( + z.clone(), + zone_token[&z] + min(nb_partitions, (c / over_size) as usize), + ); + } + + //For every partition, we count the number of zone already associated and + //the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; nb_partitions]; + let mut last_zone = vec![zones.len(); nb_partitions]; + + let mut curr_zone = 0; + + let redundancy = cl.parameters.zone_redundancy; + + for replic in 0..cl.replication_factor { + for p in 0..nb_partitions { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } + + fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } + } fn update_layout( cl: &mut ClusterLayout, node_id_vec: &Vec, node_capacity_vec: &Vec, node_zone_vec: &Vec, - zone_redundancy: usize + zone_redundancy: usize, ) { for i in 0..node_id_vec.len() { if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { @@ -901,12 +1018,12 @@ mod tests { ); cl.roles.merge(&update); } - cl.staged_parameters = Lww::::new(LayoutParameters{zone_redundancy}); + cl.staged_parameters = Lww::::new(LayoutParameters { zone_redundancy }); } #[test] fn test_assignation() { - let mut node_id_vec = vec![1, 2, 3]; + let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] .into_iter() @@ -936,11 +1053,12 @@ mod tests { assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); - node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); - } } diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 1036a8e1..17e92dd7 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -7,12 +7,11 @@ mod consul; #[cfg(feature = "kubernetes-discovery")] mod kubernetes; -pub mod layout; pub mod graph_algo; +pub mod layout; pub mod ring; pub mod system; - mod metrics; pub mod rpc_helper; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 655d21de..9e0bfa11 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,7 +565,6 @@ impl System { return 
Err(Error::Message(msg)); } - let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3 From e5664c9822c6ed1ecb30cac41b6a4125da3f88e7 Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 11 Oct 2022 17:17:13 +0200 Subject: Improved the statistics displayed in layout show corrected a few bugs --- src/garage/cli/layout.rs | 69 ++++++++++++++++++++----------- src/rpc/layout.rs | 105 +++++++++++++++++++++++++++++------------------ 2 files changed, 111 insertions(+), 63 deletions(-) (limited to 'src') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index f747fbe4..5056e57d 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -169,7 +169,7 @@ pub async fn cmd_show_layout( rpc_cli: &Endpoint, rpc_host: NodeID, ) -> Result<(), Error> { - let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER LAYOUT ===="); if !print_cluster_layout(&layout) { @@ -179,41 +179,40 @@ pub async fn cmd_show_layout( println!(); println!("Current cluster layout version: {}", layout.version); - if print_staging_role_changes(&layout) { - layout.roles.merge(&layout.staging); - - println!(); - println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - if !print_cluster_layout(&layout) { - println!("No nodes have a role in the new layout."); - } - println!(); - - println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); - println!( - "Zone redundancy: {}", - layout.staged_parameters.get().zone_redundancy - ); - println!(); + let has_role_changes = print_staging_role_changes(&layout); + let has_param_changes = print_staging_parameters_changes(&layout); + if has_role_changes || has_param_changes { + let v = layout.version; + let res_apply = layout.apply_staged_changes(Some(v + 1)); // this will print the stats of what partitions // will move around when we apply - match layout.calculate_partition_assignation() { - Ok(msg) => { + match res_apply { + Ok((layout, msg)) => { + println!(); + println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); + if !print_cluster_layout(&layout) { + println!("No nodes have a role in the new layout."); + } + println!(); + for line in msg.iter() { println!("{}", line); } println!("To enact the staged role changes, type:"); println!(); - println!(" garage layout apply --version {}", layout.version + 1); + println!(" garage layout apply --version {}", v + 1); println!(); println!( "You can also revert all proposed changes with: garage layout revert --version {}", - layout.version + 1) + v + 1) } Err(Error::Message(s)) => { println!("Error while trying to compute the assignation: {}", s); println!("This new layout cannot yet be applied."); + println!( + "You can also revert all proposed changes with: garage layout revert --version {}", + v + 1) } _ => { println!("Unknown Error"); @@ -321,21 +320,29 @@ pub async fn send_layout( } pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { - let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; + let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { Some(r) => r, _ => continue, }; let tags = role.tags.join(","); + let usage = layout.get_node_usage(id).unwrap_or(0); + let capacity = layout.get_node_capacity(id).unwrap_or(1); table.push(format!( - "{:?}\t{}\t{}\t{}", + "{:?}\t{}\t{}\t{}\t{} ({:.1}%)", id, tags, role.zone, - role.capacity_string() 
+ role.capacity_string(), + usage as u32 * layout.partition_size, + (100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32) )); } + println!(); + println!("Parameters of the layout computation:"); + println!("Zone redundancy: {}", layout.parameters.zone_redundancy); + println!(); if table.len() == 1 { false } else { @@ -344,6 +351,20 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { } } +pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { + let has_changes = layout.staged_parameters.get().clone() != layout.parameters; + if has_changes { + println!(); + println!("==== NEW LAYOUT PARAMETERS ===="); + println!( + "Zone redundancy: {}", + layout.staged_parameters.get().zone_redundancy + ); + println!(); + } + has_changes +} + pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { let has_changes = layout .staging diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 3a6f42ee..d2ed8af8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -205,6 +205,7 @@ To know the correct value of the new layout version, invoke `garage layout show` self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + self.staged_parameters.update(self.parameters.clone()); self.version += 1; @@ -267,6 +268,26 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + ///Returns the number of partitions associated to this node in the ring + pub fn get_node_usage(&self, uuid: &Uuid) -> Result { + for (i, id) in self.node_id_vec.iter().enumerate() { + if id == uuid { + let mut count = 0; + for nod in self.ring_assignation_data.iter() { + if i as u8 == *nod { + count += 1 + } + } + return Ok(count); + } + } + Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )) + } + ///Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; @@ -357,11 +378,10 @@ To know the correct value of the new layout version, invoke `garage layout show` //algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); - let partition_size = cl2 - .compute_optimal_partition_size(&zone_to_id) - .expect("Critical Error"); - if partition_size != self.partition_size { - return false; + match cl2.compute_optimal_partition_size(&zone_to_id) { + Ok(s) if s != self.partition_size => return false, + Err(_) => return false, + _ => (), } true @@ -376,8 +396,9 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - /// Staged changes must be merged with nodes roles before calling this function. - pub fn calculate_partition_assignation(&mut self) -> Result { + // Staged role changes must be merged with nodes roles before calling this function, + // hence it must only be called from apply_staged_changes() and it is not public. + fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. @@ -386,13 +407,15 @@ impl ClusterLayout { //changes in the layout. 
We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - let redundancy = self.staged_parameters.get().zone_redundancy; + self.parameters = self.staged_parameters.get().clone(); let mut msg = Message::new(); + msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); + msg.push("".into()); msg.push(format!( - "Computation of a new cluster layout where partitions are \ + "Partitions are \ replicated {} times on at least {} distinct zones.", - self.replication_factor, redundancy + self.replication_factor, self.parameters.zone_redundancy )); //We generate for once numerical ids for the zones of non gateway nodes, @@ -400,11 +423,6 @@ impl ClusterLayout { let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; let nb_useful_nodes = self.useful_nodes().len(); - msg.push(format!( - "The cluster contains {} nodes spread over {} zones.", - nb_useful_nodes, - id_to_zone.len() - )); if nb_useful_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ @@ -412,12 +430,12 @@ impl ClusterLayout { nb_useful_nodes, self.replication_factor ))); } - if id_to_zone.len() < redundancy { + if id_to_zone.len() < self.parameters.zone_redundancy { return Err(Error::Message(format!( "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", id_to_zone.len(), - redundancy + self.parameters.zone_redundancy ))); } @@ -429,10 +447,8 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!( - "Given the replication and redundancy constraint, the \ - optimal size of a partition is {}. In the previous layout, it used to \ - be {} (the zone redundancy was {}).", - partition_size, self.partition_size, self.parameters.zone_redundancy + "Optimal size of a partition: {} (was {} in the previous layout).", + partition_size, self.partition_size )); } else { msg.push(format!( @@ -442,7 +458,6 @@ impl ClusterLayout { )); } self.partition_size = partition_size; - self.parameters = self.staged_parameters.get().clone(); if partition_size < 100 { msg.push( @@ -470,6 +485,13 @@ impl ClusterLayout { //We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + + if !self.check() { + return Err(Error::Message( + "Critical error: The computed layout happens to be incorrect".into(), + )); + } + Ok(msg) } @@ -553,12 +575,6 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); - if !self.check() { - return Err(Error::Message( - "Critical error: The computed layout happens to be incorrect".into(), - )); - } - Ok(Some(old_assignation)) } @@ -652,7 +668,7 @@ impl ClusterLayout { ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.staged_parameters.get().zone_redundancy; + let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; g.add_edge( @@ -774,8 +790,9 @@ impl ClusterLayout { let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push("".into()); msg.push(format!( - "Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap, total_cap, percent_cap )); 
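	//Worked example of the computation above: if PARTITION_BITS is 8
	//(i.e. 1 << PARTITION_BITS = 256 partitions), with a replication factor
	//of 3 and, say, a computed partition size of 1000, then
	//used_cap = 256 * 3 * 1000 = 768000, which the percentage compares
	//against the summed capacity of all storing nodes. (Numbers illustrative.)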
msg.push("".into()); @@ -878,7 +895,7 @@ impl ClusterLayout { } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); msg.push(format!( - " Available capacity / Total capacity: {}/{} ({:.1}%).", + " Usable capacity / Total capacity: {}/{} ({:.1}%).", available_cap_z, total_cap_z, percent_cap_z )); @@ -891,7 +908,7 @@ impl ClusterLayout { .tags_string(); msg.push(format!( " Node {}: {} partitions ({} new) ; \ - available/total capacity: {} / {} ({:.1}%) ; tags:{}", + usable/total capacity: {} / {} ({:.1}%) ; tags:{}", &self.node_id_vec[*n].to_vec()[0..2] .to_vec() .encode_hex::(), @@ -1008,7 +1025,7 @@ mod tests { cl.node_id_vec.push(x); } - let update = cl.roles.update_mutator( + let update = cl.staging.update_mutator( cl.node_id_vec[i], NodeRoleV(Some(NodeRole { zone: (node_zone_vec[i].to_string()), @@ -1016,9 +1033,11 @@ mod tests { tags: (vec![]), })), ); - cl.roles.merge(&update); + cl.staging.merge(&update); } - cl.staged_parameters = Lww::::new(LayoutParameters { zone_redundancy }); + cl.staging_hash = blake2sum(&rmp_to_vec_all_named(&cl.staging).unwrap()[..]); + cl.staged_parameters + .update(LayoutParameters { zone_redundancy }); } #[test] @@ -1032,7 +1051,9 @@ mod tests { let mut cl = ClusterLayout::new(3); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); @@ -1043,13 +1064,17 @@ mod tests { .map(|x| x.to_string()) .collect(); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); @@ -1057,7 +1082,9 @@ mod tests { 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, ]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); } -- cgit v1.2.3 From bcdd1e0c3335500a6d0337ce6ee050fb59fc665a Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 11 Oct 2022 18:29:21 +0200 Subject: Added some comment --- src/rpc/graph_algo.rs | 79 ++++++++++++------------ src/rpc/layout.rs | 162 +++++++++++++++++++++++++++++--------------------- 2 files changed, 132 insertions(+), 109 deletions(-) (limited to 'src') diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 13c60692..5bd6cc51 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -6,10 +6,10 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::collections::VecDeque; -//Vertex data structures used in all the graphs used in layout.rs. 
-//usize parameters correspond to node/zone/partitions ids. -//To understand the vertex roles below, please refer to the formal description -//of the layout computation algorithm. +///Vertex data structures used in all the graphs used in layout.rs. +///usize parameters correspond to node/zone/partitions ids. +///To understand the vertex roles below, please refer to the formal description +///of the layout computation algorithm. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum Vertex { Source, @@ -20,8 +20,7 @@ pub enum Vertex { Sink, } -//Edge data structure for the flow algorithm. -//The graph is stored as an adjacency list +///Edge data structure for the flow algorithm. #[derive(Clone, Copy, Debug)] pub struct FlowEdge { cap: u32, //flow maximal capacity of the edge @@ -30,8 +29,7 @@ pub struct FlowEdge { rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v } -//Edge data structure for the detection of negative cycles. -//The graph is stored as a list of edges (u,v). +///Edge data structure for the detection of negative cycles. #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { w: i32, //weight of the edge @@ -42,13 +40,14 @@ pub trait Edge: Clone + Copy {} impl Edge for FlowEdge {} impl Edge for WeightedEdge {} -//Struct for the graph structure. We do encapsulation here to be able to both -//provide user friendly Vertex enum to address vertices, and to use usize indices -//and Vec instead of HashMap in the graph algorithm to optimize execution speed. +///Struct for the graph structure. We do encapsulation here to be able to both +///provide user friendly Vertex enum to address vertices, and to use internally usize +///indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. pub struct Graph { vertextoid: HashMap, idtovertex: Vec, + //The graph is stored as an adjacency list graph: Vec>, } @@ -69,8 +68,8 @@ impl Graph { } impl Graph { - //This function adds a directed edge to the graph with capacity c, and the - //corresponding reversed edge with capacity 0. + ///This function adds a directed edge to the graph with capacity c, and the + ///corresponding reversed edge with capacity 0. pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -94,8 +93,8 @@ impl Graph { Ok(()) } - //This function returns the list of vertices that receive a positive flow from - //vertex v. + ///This function returns the list of vertices that receive a positive flow from + ///vertex v. pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { if !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -110,7 +109,7 @@ impl Graph { Ok(result) } - //This function returns the value of the flow incoming to v. + ///This function returns the value of the flow incoming to v. pub fn get_inflow(&self, v: Vertex) -> Result { if !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -123,7 +122,7 @@ impl Graph { Ok(result) } - //This function returns the value of the flow outgoing from v. + ///This function returns the value of the flow outgoing from v. 
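+///Only positive flow values are summed here; for a valid flow, conservation
+///makes inflow and outflow agree on every vertex except Source and Sink.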
pub fn get_outflow(&self, v: Vertex) -> Result { if !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -136,14 +135,14 @@ impl Graph { Ok(result) } - //This function computes the flow total value by computing the outgoing flow - //from the source. + ///This function computes the flow total value by computing the outgoing flow + ///from the source. pub fn get_flow_value(&mut self) -> Result { self.get_outflow(Vertex::Source) } - //This function shuffles the order of the edge lists. It keeps the ids of the - //reversed edges consistent. + ///This function shuffles the order of the edge lists. It keeps the ids of the + ///reversed edges consistent. fn shuffle_edges(&mut self) { let mut rng = rand::thread_rng(); for i in 0..self.graph.len() { @@ -157,7 +156,7 @@ impl Graph { } } - //Computes an upper bound of the flow n the graph + ///Computes an upper bound of the flow on the graph pub fn flow_upper_bound(&self) -> u32 { let idsource = self.vertextoid[&Vertex::Source]; let mut flow_upper_bound = 0; @@ -167,9 +166,9 @@ impl Graph { flow_upper_bound } - //This function computes the maximal flow using Dinic's algorithm. It starts with - //the flow values already present in the graph. So it is possible to add some edge to - //the graph, compute a flow, add other edges, update the flow. + ///This function computes the maximal flow using Dinic's algorithm. It starts with + ///the flow values already present in the graph. So it is possible to add some edge to + ///the graph, compute a flow, add other edges, update the flow. pub fn compute_maximal_flow(&mut self) -> Result<(), String> { if !self.vertextoid.contains_key(&Vertex::Source) { return Err("The graph does not contain a source.".to_string()); @@ -270,11 +269,11 @@ impl Graph { Ok(()) } - //This function takes a flow, and a cost function on the edges, and tries to find an - // equivalent flow with a better cost, by finding improving overflow cycles. It uses - // as subroutine the Bellman Ford algorithm run up to path_length. - // We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only - // one needs to be present in the cost function. + ///This function takes a flow, and a cost function on the edges, and tries to find an + /// equivalent flow with a better cost, by finding improving overflow cycles. It uses + /// as subroutine the Bellman Ford algorithm run up to path_length. + /// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and + /// only one needs to be present in the cost function. pub fn optimize_flow_with_cost( &mut self, cost: &CostFunction, @@ -309,7 +308,7 @@ impl Graph { Ok(()) } - //Construct the weighted graph G_f from the flow and the cost function + ///Construct the weighted graph G_f from the flow and the cost function fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { let mut g = Graph::::new(&self.idtovertex); let nb_vertices = self.idtovertex.len(); @@ -334,7 +333,7 @@ impl Graph { } impl Graph { - //This function adds a single directed weighted edge to the graph. + ///This function adds a single directed weighted edge to the graph. 
pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -345,12 +344,12 @@ impl Graph { Ok(()) } - //This function lists the negative cycles it manages to find after path_length - //iterations of the main loop of the Bellman-Ford algorithm. For the classical - //algorithm, path_length needs to be equal to the number of vertices. However, - //for particular graph structures like our case, the algorithm is still correct - //when path_length is the length of the longest possible simple path. - //See the formal description of the algorithm for more details. + ///This function lists the negative cycles it manages to find after path_length + ///iterations of the main loop of the Bellman-Ford algorithm. For the classical + ///algorithm, path_length needs to be equal to the number of vertices. However, + ///for particular graph structures like in our case, the algorithm is still correct + ///when path_length is the length of the longest possible simple path. + ///See the formal description of the algorithm for more details. fn list_negative_cycles(&self, path_length: usize) -> Vec> { let nb_vertices = self.graph.len(); @@ -384,8 +383,8 @@ impl Graph { } } -//This function returns the list of cycles of a directed 1 forest. It does not -//check for the consistency of the input. +///This function returns the list of cycles of a directed 1 forest. It does not +///check for the consistency of the input. fn cycles_of_1_forest(forest: &[Option]) -> Vec> { let mut cycles = Vec::>::new(); let mut time_of_discovery = vec![None; forest.len()]; diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index d2ed8af8..38e56b88 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -17,6 +17,8 @@ use crate::ring::*; use std::convert::TryInto; +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; + //The Message type will be used to collect information on the algorithm. type Message = Vec; @@ -28,9 +30,11 @@ pub struct ClusterLayout { pub replication_factor: usize, - //This attribute is only used to retain the previously computed partition size, - //to know to what extent does it change with the layout update. + ///This attribute is only used to retain the previously computed partition size, + ///to know to what extent does it change with the layout update. pub partition_size: u32, + ///Parameters used to compute the assignation currently given by + ///ring_assignation_data pub parameters: LayoutParameters, pub roles: LwwMap, @@ -48,8 +52,9 @@ pub struct ClusterLayout { #[serde(with = "serde_bytes")] pub ring_assignation_data: Vec, - /// Role changes which are staged for the next version of the layout + /// Parameters to be used in the next partition assignation computation. pub staged_parameters: Lww, + /// Role changes which are staged for the next version of the layout pub staging: LwwMap, pub staging_hash: Hash, } @@ -65,8 +70,6 @@ impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -77,12 +80,13 @@ impl AutoCrdt for NodeRoleV { /// The user-assigned roles of cluster nodes #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRole { - /// Datacenter at which this entry belong. 
This information might be used to perform a better - /// geodistribution + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution pub zone: String, - /// The (relative) capacity of the node + /// The capacity of the node /// If this is set to None, the node does not participate in storing data for the system /// and is only active as an API gateway to other nodes + // TODO : change the capacity to u64 and use byte unit input/output pub capacity: Option, /// A set of tags to recognize the node pub tags: Vec, @@ -110,6 +114,7 @@ impl NodeRole { } } +//Implementation of the ClusterLayout methods unrelated to the assignation algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { //We set the default zone redundancy to be equal to the replication factor, @@ -231,7 +236,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } ///Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub fn useful_nodes(&self) -> Vec { + pub fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { @@ -291,13 +296,14 @@ To know the correct value of the new layout version, invoke `garage layout show` ///Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; - for uuid in self.useful_nodes().iter() { + for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; } Ok(total_capacity) } /// Check a cluster layout for internal consistency + /// (assignation, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> bool { // Check that the hash of the staging data is correct @@ -377,7 +383,7 @@ To know the correct value of the new layout version, invoke `garage layout show` //Check that the partition size stored is the one computed by the asignation //algorithm. let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); match cl2.compute_optimal_partition_size(&zone_to_id) { Ok(s) if s != self.partition_size => return false, Err(_) => return false, @@ -388,6 +394,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } +//Implementation of the ClusterLayout methods related to the assignation algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor @@ -397,16 +404,13 @@ impl ClusterLayout { /// the former assignation (if any) to minimize the amount of /// data to be moved. // Staged role changes must be merged with nodes roles before calling this function, - // hence it must only be called from apply_staged_changes() and it is not public. + // hence it must only be called from apply_staged_changes() and hence is not public. fn calculate_partition_assignation(&mut self) -> Result { - //The nodes might have been updated, some might have been deleted. - //So we need to first update the list of nodes and retrieve the - //assignation. - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with the new ids + //changes in the layout. 
We retrieve the old_assignation reframed with new ids let old_assignation_opt = self.update_node_id_vec()?; + //We update the parameters self.parameters = self.staged_parameters.get().clone(); let mut msg = Message::new(); @@ -420,14 +424,14 @@ impl ClusterLayout { //We generate for once numerical ids for the zones of non gateway nodes, //to use them as indices in the flow graphs. - let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; + let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_useful_nodes = self.useful_nodes().len(); - if nb_useful_nodes < self.replication_factor { + let nb_nongateway_nodes = self.nongateway_nodes().len(); + if nb_nongateway_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_useful_nodes, self.replication_factor + nb_nongateway_nodes, self.replication_factor ))); } if id_to_zone.len() < self.parameters.zone_redundancy { @@ -457,6 +461,7 @@ impl ClusterLayout { partition_size )); } + //We write the partition size. self.partition_size = partition_size; if partition_size < 100 { @@ -467,14 +472,15 @@ impl ClusterLayout { ); } - //We compute a first flow/assignment that is heuristically close to the previous - //assignment - let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignation_opt)?; + //We compute a first flow/assignation that is heuristically close to the previous + //assignation + let mut gflow = self.compute_candidate_assignation(&zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignment. + //We minimize the distance to the previous assignation. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } + //We display statistics of the computation msg.append(&mut self.output_stat( &gflow, &old_assignation_opt, @@ -538,14 +544,13 @@ impl ClusterLayout { // (2) We retrieve the old association //We rewrite the old association with the new indices. We only consider partition //to node assignations where the node is still in use. - let nb_partitions = 1usize << PARTITION_BITS; - let mut old_assignation = vec![Vec::::new(); nb_partitions]; + let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; if self.ring_assignation_data.is_empty() { //This is a new association return Ok(None); } - if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( "The old assignation does not have a size corresponding to \ the old replication factor or the number of partitions." @@ -580,11 +585,11 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. - fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - for uuid in self.useful_nodes().iter() { + for uuid in self.nongateway_nodes().iter() { if self.roles.get(uuid) == None { return Err(Error::Message( "The uuid was not found in the node roles (this should \ @@ -603,17 +608,16 @@ impl ClusterLayout { } ///This function computes by dichotomy the largest realizable partition size, given - ///the layout. + ///the layout roles and parameters. 
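+	///Feasibility is monotone in the size: a larger size lowers the capacity
+	///node_capacity / size of every node edge, so if some size admits a flow
+	///saturating all partitions, any smaller size does too. The dichotomy below
+	///keeps `s_down` feasible and `s_up` infeasible, as in this sketch (where
+	///`is_feasible(s)` is shorthand, not a function of this module, for "the
+	///flow graph of size s admits a flow of value
+	///NB_PARTITIONS * replication_factor"):
+	///    while s_down + 1 < s_up {
+	///        let mid = (s_down + s_up) / 2;
+	///        if is_feasible(mid) { s_down = mid; } else { s_up = mid; }
+	///    }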
fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, ) -> Result { - let nb_partitions = 1usize << PARTITION_BITS; let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? - < (nb_partitions * self.replication_factor) + < (NB_PARTITIONS * self.replication_factor) .try_into() .unwrap() { @@ -630,7 +634,7 @@ impl ClusterLayout { g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? - < (nb_partitions * self.replication_factor) + < (NB_PARTITIONS * self.replication_factor) .try_into() .unwrap() { @@ -658,14 +662,21 @@ impl ClusterLayout { vertices } + ///Generates the graph to compute the maximal flow corresponding to the optimal + ///partition assignation. + ///exclude_assoc is the set of (partition, node) association that we are forbidden + ///to use (hence we do not add the corresponding edge to the graph). This parameter + ///is used to compute a first flow that uses only edges appearing in the previous + ///assignation. This produces a solution that heuristically should be close to the + ///previous one. fn generate_flow_graph( &self, - size: u32, + partition_size: u32, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, ) -> Result, Error> { let vertices = - ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); + ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); let redundancy = self.parameters.zone_redundancy; @@ -685,10 +696,10 @@ impl ClusterLayout { )?; } } - for n in 0..self.useful_nodes().len() { + for n in 0..self.nongateway_nodes().len() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / size)?; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; @@ -698,28 +709,34 @@ impl ClusterLayout { Ok(g) } - fn compute_candidate_assignment( + ///This function computes a first optimal assignation (in the form of a flow graph). 
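+	///It proceeds in two steps: a maximal flow is first computed on the graph
+	///restricted to the edges of the previous assignation; the excluded edges
+	///are then added back and the flow is completed into a maximal flow of the
+	///full graph. Since the second computation only augments the flow found by
+	///the first one, the result heuristically stays close to the previous
+	///assignation.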
+ fn compute_candidate_assignation( &self, zone_to_id: &HashMap, - old_assoc_opt: &Option>>, + prev_assign_opt: &Option>>, ) -> Result, Error> { - //We list the edges that are not used in the old association + //We list the (partition,node) associations that are not used in the + //previous assignation let mut exclude_edge = HashSet::<(usize, usize)>::new(); - if let Some(old_assoc) = old_assoc_opt { - let nb_nodes = self.useful_nodes().len(); - for (p, old_assoc_p) in old_assoc.iter().enumerate() { + if let Some(prev_assign) = prev_assign_opt { + let nb_nodes = self.nongateway_nodes().len(); + for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); } - for n in old_assoc_p.iter() { + for n in prev_assign_p.iter() { exclude_edge.remove(&(p, *n)); } } } - //We compute the best flow using only the edges used in the old assoc + //We compute the best flow using only the edges used in the previous assignation let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; g.compute_maximal_flow()?; + + //We add the excluded edges and compute the maximal flow with the full graph. + //The algorithm is such that it will start with the flow that we just computed + //and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; @@ -728,26 +745,35 @@ impl ClusterLayout { Ok(g) } + ///This function updates the flow graph gflow to minimize the distance between + ///its corresponding assignation and the previous one fn minimize_rebalance_load( &self, gflow: &mut Graph, zone_to_id: &HashMap, - old_assoc: &[Vec], + prev_assign: &[Vec], ) -> Result<(), Error> { + //We define a cost function on the edges (pairs of vertices) corresponding + //to the distance between the two assignations. let mut cost = CostFunction::new(); - for (p, assoc_p) in old_assoc.iter().enumerate() { + for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } - let nb_nodes = self.useful_nodes().len(); + + //We compute the maximal length of a simple path in gflow. It is used in the + //Bellman-Ford algorithm in optimize_flow_with_cost to set the number + //of iterations. + let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; Ok(()) } + ///This function updates the assignation ring from the flow graph. fn update_ring_from_flow( &mut self, nb_zones: usize, @@ -775,19 +801,18 @@ impl ClusterLayout { Ok(()) } - //This function returns a message summing up the partition repartition of the new - //layout. + ///This function returns a message summing up the partition repartition of the new + ///layout, and other statistics of the partition assignation computation. 
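+	///A partition copy is counted as "new" when it lands on a node (or zone)
+	///that did not hold it in the previous assignation; when there is no
+	///previous assignation, every stored copy is counted as new.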
fn output_stat( &self, gflow: &Graph, - old_assoc_opt: &Option>>, + prev_assign_opt: &Option>>, zone_to_id: &HashMap, id_to_zone: &[String], ) -> Result { let mut msg = Message::new(); - let nb_partitions = 1usize << PARTITION_BITS; - let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; + let used_cap = self.partition_size * NB_PARTITIONS as u32 * self.replication_factor as u32; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push("".into()); @@ -813,21 +838,21 @@ impl ClusterLayout { )); //We define and fill in the following tables - let storing_nodes = self.useful_nodes(); + let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; let mut new_partitions_zone = vec![0; id_to_zone.len()]; let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - for p in 0..nb_partitions { + for p in 0..NB_PARTITIONS { for z in 0..id_to_zone.len() { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; if !pz_nodes.is_empty() { stored_partitions_zone[z] += 1; - if let Some(old_assoc) = old_assoc_opt { + if let Some(prev_assign) = prev_assign_opt { let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { + for n in prev_assign[p].iter() { old_zones_of_p .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); } @@ -839,8 +864,8 @@ impl ClusterLayout { for vert in pz_nodes.iter() { if let Vertex::N(n) = *vert { stored_partitions[n] += 1; - if let Some(old_assoc) = old_assoc_opt { - if !old_assoc[p].contains(&n) { + if let Some(prev_assign) = prev_assign_opt { + if !prev_assign[p].contains(&n) { new_partitions[n] += 1; } } @@ -849,7 +874,7 @@ impl ClusterLayout { } } - if *old_assoc_opt == None { + if *prev_assign_opt == None { new_partitions = stored_partitions.clone(); new_partitions_zone = stored_partitions_zone.clone(); } @@ -857,7 +882,7 @@ impl ClusterLayout { //We display the statistics msg.push("".into()); - if *old_assoc_opt != None { + if *prev_assign_opt != None { let total_new_partitions: usize = new_partitions.iter().sum(); msg.push(format!( "A total of {} new copies of partitions need to be \ @@ -950,9 +975,8 @@ mod tests { fn check_against_naive(cl: &ClusterLayout) -> Result { let over_size = cl.partition_size + 1; let mut zone_token = HashMap::::new(); - let nb_partitions = 1usize << PARTITION_BITS; - let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; if zones.is_empty() { return Ok(false); @@ -961,12 +985,12 @@ mod tests { for z in zones.iter() { zone_token.insert(z.clone(), 0); } - for uuid in cl.useful_nodes().iter() { + for uuid in cl.nongateway_nodes().iter() { let z = cl.get_node_zone(uuid)?; let c = cl.get_node_capacity(uuid)?; zone_token.insert( z.clone(), - zone_token[&z] + min(nb_partitions, (c / over_size) as usize), + zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), ); } @@ -978,15 +1002,15 @@ mod tests { id_zone_token[zone_to_id[z]] = *t; } - let mut nb_token = vec![0; nb_partitions]; - let mut last_zone = vec![zones.len(); nb_partitions]; + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; let mut curr_zone = 0; let redundancy = cl.parameters.zone_redundancy; for replic in 0..cl.replication_factor { - for p in 0..nb_partitions { + for p in 0..NB_PARTITIONS { while id_zone_token[curr_zone] == 0 || (last_zone[p] 
== curr_zone && redundancy - nb_token[p] <= cl.replication_factor - replic) -- cgit v1.2.3 From ea5afc251106b3f6e2d07f942ba1f88abeef8765 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 19:34:40 +0100 Subject: Style improvements --- src/api/admin/cluster.rs | 6 +- src/garage/cli/cmd.rs | 2 +- src/garage/cli/layout.rs | 20 ++-- src/rpc/graph_algo.rs | 273 +++++++++++++++++++++++------------------------ src/rpc/layout.rs | 247 +++++++++++++++++++++--------------------- 5 files changed, 271 insertions(+), 277 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 61bfb8c5..040778b1 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -86,7 +86,7 @@ fn get_cluster_layout(garage: &Arc) -> GetClusterLayoutResponse { .map(|(k, _, v)| (hex::encode(k), v.0.clone())) .collect(), staged_role_changes: layout - .staging + .staging_roles .items() .iter() .filter(|(k, _, v)| layout.roles.get(k) != Some(v)) @@ -137,14 +137,14 @@ pub async fn handle_update_cluster_layout( let mut layout = garage.system.get_cluster_layout(); let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); for (node, role) in updates { let node = hex::decode(node).ok_or_bad_request("Invalid node identifier")?; let node = Uuid::try_from(&node).ok_or_bad_request("Invalid node identifier")?; layout - .staging + .staging_roles .merge(&roles.update_mutator(node, NodeRoleV(role))); } diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c8b96489..e352ddf2 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -71,7 +71,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } _ => { - let new_role = match layout.staging.get(&adv.id) { + let new_role = match layout.staging_roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "(pending)", _ => "NO ROLE ASSIGNED", }; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 5056e57d..4b23a096 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -63,14 +63,14 @@ pub async fn cmd_assign_role( .collect::, _>>()?; let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); for replaced in args.replace.iter() { let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout - .staging + .staging_roles .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); } _ => { @@ -128,7 +128,7 @@ pub async fn cmd_assign_role( }; layout - .staging + .staging_roles .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); } @@ -148,13 +148,13 @@ pub async fn cmd_remove_role( let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); let deleted_node = find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; layout - .staging + .staging_roles .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); send_layout(rpc_cli, rpc_host, layout).await?; @@ -278,7 +278,7 @@ pub async fn cmd_config_layout( println!("The zone redundancy must be at least 1."); } else { layout - .staged_parameters + .staging_parameters .update(LayoutParameters { zone_redundancy: r }); println!("The new zone redundancy has been saved ({}).", r); } @@ -352,13 +352,13 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { } pub fn 
print_staging_parameters_changes(layout: &ClusterLayout) -> bool { - let has_changes = layout.staged_parameters.get().clone() != layout.parameters; + let has_changes = layout.staging_parameters.get().clone() != layout.parameters; if has_changes { println!(); println!("==== NEW LAYOUT PARAMETERS ===="); println!( "Zone redundancy: {}", - layout.staged_parameters.get().zone_redundancy + layout.staging_parameters.get().zone_redundancy ); println!(); } @@ -367,7 +367,7 @@ pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { let has_changes = layout - .staging + .staging_roles .items() .iter() .any(|(k, _, v)| layout.roles.get(k) != Some(v)); @@ -376,7 +376,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { println!(); println!("==== STAGED ROLE CHANGES ===="); let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging.items().iter() { + for (id, _, role) in layout.staging_roles.items().iter() { if layout.roles.get(id) == Some(role) { continue; } diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 5bd6cc51..1e4a819b 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -6,33 +6,33 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::collections::VecDeque; -///Vertex data structures used in all the graphs used in layout.rs. -///usize parameters correspond to node/zone/partitions ids. -///To understand the vertex roles below, please refer to the formal description -///of the layout computation algorithm. +/// Vertex data structures used in all the graphs used in layout.rs. +/// usize parameters correspond to node/zone/partitions ids. +/// To understand the vertex roles below, please refer to the formal description +/// of the layout computation algorithm. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum Vertex { Source, - Pup(usize), //The vertex p+ of partition p - Pdown(usize), //The vertex p- of partition p - PZ(usize, usize), //The vertex corresponding to x_(partition p, zone z) - N(usize), //The vertex corresponding to node n + Pup(usize), // The vertex p+ of partition p + Pdown(usize), // The vertex p- of partition p + PZ(usize, usize), // The vertex corresponding to x_(partition p, zone z) + N(usize), // The vertex corresponding to node n Sink, } -///Edge data structure for the flow algorithm. +/// Edge data structure for the flow algorithm. #[derive(Clone, Copy, Debug)] pub struct FlowEdge { - cap: u32, //flow maximal capacity of the edge - flow: i32, //flow value on the edge - dest: usize, //destination vertex id - rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v + cap: u32, // flow maximal capacity of the edge + flow: i32, // flow value on the edge + dest: usize, // destination vertex id + rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v } -///Edge data structure for the detection of negative cycles. +/// Edge data structure for the detection of negative cycles. #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { - w: i32, //weight of the edge + w: i32, // weight of the edge dest: usize, } @@ -40,14 +40,14 @@ pub trait Edge: Clone + Copy {} impl Edge for FlowEdge {} impl Edge for WeightedEdge {} -///Struct for the graph structure. 
We do encapsulation here to be able to both -///provide user friendly Vertex enum to address vertices, and to use internally usize -///indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. +/// Struct for the graph structure. We do encapsulation here to be able to both +/// provide user friendly Vertex enum to address vertices, and to use internally usize +/// indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. pub struct Graph { - vertextoid: HashMap, - idtovertex: Vec, + vertex_to_id: HashMap, + id_to_vertex: Vec, - //The graph is stored as an adjacency list + // The graph is stored as an adjacency list graph: Vec>, } @@ -60,22 +60,30 @@ impl Graph { map.insert(*vert, i); } Graph:: { - vertextoid: map, - idtovertex: vertices.to_vec(), + vertex_to_id: map, + id_to_vertex: vertices.to_vec(), graph: vec![Vec::::new(); vertices.len()], } } + + fn get_vertex_id(&self, v: &Vertex) -> Result { + self.vertex_to_id + .get(v) + .cloned() + .ok_or_else(|| format!("The graph does not contain vertex {:?}", v)) + } } impl Graph { - ///This function adds a directed edge to the graph with capacity c, and the - ///corresponding reversed edge with capacity 0. + /// This function adds a directed edge to the graph with capacity c, and the + /// corresponding reversed edge with capacity 0. pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + if idu == idv { + return Err("Cannot add edge from vertex to itself in flow graph".into()); } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; + let rev_u = self.graph[idu].len(); let rev_v = self.graph[idv].len(); self.graph[idu].push(FlowEdge { @@ -93,28 +101,22 @@ impl Graph { Ok(()) } - ///This function returns the list of vertices that receive a positive flow from - ///vertex v. + /// This function returns the list of vertices that receive a positive flow from + /// vertex v. pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; + let idv = self.get_vertex_id(&v)?; let mut result = Vec::::new(); for edge in self.graph[idv].iter() { if edge.flow > 0 { - result.push(self.idtovertex[edge.dest]); + result.push(self.id_to_vertex[edge.dest]); } } Ok(result) } - ///This function returns the value of the flow incoming to v. + /// This function returns the value of the flow incoming to v. pub fn get_inflow(&self, v: Vertex) -> Result { - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; + let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { result += max(0, self.graph[edge.dest][edge.rev].flow); @@ -122,12 +124,9 @@ impl Graph { Ok(result) } - ///This function returns the value of the flow outgoing from v. + /// This function returns the value of the flow outgoing from v. 
pub fn get_outflow(&self, v: Vertex) -> Result { - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; + let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { result += max(0, edge.flow); @@ -135,19 +134,19 @@ impl Graph { Ok(result) } - ///This function computes the flow total value by computing the outgoing flow - ///from the source. + /// This function computes the flow total value by computing the outgoing flow + /// from the source. pub fn get_flow_value(&mut self) -> Result { self.get_outflow(Vertex::Source) } - ///This function shuffles the order of the edge lists. It keeps the ids of the - ///reversed edges consistent. + /// This function shuffles the order of the edge lists. It keeps the ids of the + /// reversed edges consistent. fn shuffle_edges(&mut self) { let mut rng = rand::thread_rng(); for i in 0..self.graph.len() { self.graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. + // We need to update the ids of the reverse edges. for j in 0..self.graph[i].len() { let target_v = self.graph[i][j].dest; let target_rev = self.graph[i][j].rev; @@ -156,97 +155,86 @@ impl Graph { } } - ///Computes an upper bound of the flow on the graph - pub fn flow_upper_bound(&self) -> u32 { - let idsource = self.vertextoid[&Vertex::Source]; + /// Computes an upper bound of the flow on the graph + pub fn flow_upper_bound(&self) -> Result { + let idsource = self.get_vertex_id(&Vertex::Source)?; let mut flow_upper_bound = 0; for edge in self.graph[idsource].iter() { flow_upper_bound += edge.cap; } - flow_upper_bound + Ok(flow_upper_bound) } - ///This function computes the maximal flow using Dinic's algorithm. It starts with - ///the flow values already present in the graph. So it is possible to add some edge to - ///the graph, compute a flow, add other edges, update the flow. + /// This function computes the maximal flow using Dinic's algorithm. It starts with + /// the flow values already present in the graph. So it is possible to add some edge to + /// the graph, compute a flow, add other edges, update the flow. pub fn compute_maximal_flow(&mut self) -> Result<(), String> { - if !self.vertextoid.contains_key(&Vertex::Source) { - return Err("The graph does not contain a source.".to_string()); - } - if !self.vertextoid.contains_key(&Vertex::Sink) { - return Err("The graph does not contain a sink.".to_string()); - } - - let idsource = self.vertextoid[&Vertex::Source]; - let idsink = self.vertextoid[&Vertex::Sink]; + let idsource = self.get_vertex_id(&Vertex::Source)?; + let idsink = self.get_vertex_id(&Vertex::Sink)?; let nb_vertices = self.graph.len(); - let flow_upper_bound = self.flow_upper_bound(); + let flow_upper_bound = self.flow_upper_bound()?; - //To ensure the dispersion of the associations generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //the vertices do not consider their neighbours in the same order. + // To ensure the dispersion of the associations generated by the + // assignation, we shuffle the neighbours of the nodes. Hence, + // the vertices do not consider their neighbours in the same order. self.shuffle_edges(); - //We run Dinic's max flow algorithm + // We run Dinic's max flow algorithm loop { - //We build the level array from Dinic's algorithm. + // We build the level array from Dinic's algorithm. 
let mut level = vec![None; nb_vertices]; let mut fifo = VecDeque::new(); fifo.push_back((idsource, 0)); - while !fifo.is_empty() { - if let Some((id, lvl)) = fifo.pop_front() { - if level[id] == None { - //it means id has not yet been reached - level[id] = Some(lvl); - for edge in self.graph[id].iter() { - if edge.cap as i32 - edge.flow > 0 { - fifo.push_back((edge.dest, lvl + 1)); - } + while let Some((id, lvl)) = fifo.pop_front() { + if level[id] == None { + // it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i32 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); } } } } if level[idsink] == None { - //There is no residual flow + // There is no residual flow break; } - //Now we run DFS respecting the level array + // Now we run DFS respecting the level array let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); + let mut lifo = Vec::new(); - lifo.push_back((idsource, flow_upper_bound)); + lifo.push((idsource, flow_upper_bound)); - while let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; + while let Some((id, f)) = lifo.last().cloned() { if id == idsink { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while let Some((id, _)) = lifo.pop_back() { + // The DFS reached the sink, we can add a + // residual flow. + lifo.pop(); + while let Some((id, _)) = lifo.pop() { let nbd = next_nbd[id]; self.graph[id][nbd].flow += f as i32; let id_rev = self.graph[id][nbd].dest; let nbd_rev = self.graph[id][nbd].rev; self.graph[id_rev][nbd_rev].flow -= f as i32; } - lifo.push_back((idsource, flow_upper_bound)); + lifo.push((idsource, flow_upper_bound)); continue; } - //else we did not reach the sink + // else we did not reach the sink let nbd = next_nbd[id]; if nbd >= self.graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { + // There is nothing to explore from id anymore + lifo.pop(); + if let Some((parent, _)) = lifo.last() { next_nbd[*parent] += 1; } continue; } - //else we can try to send flow from id to its nbd + // else we can try to send flow from id to its nbd let new_flow = min( f as i32, self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow, @@ -257,19 +245,19 @@ impl Graph { } if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { if lvldest <= lvlid { - //We cannot send flow to nbd. + // We cannot send flow to nbd. next_nbd[id] += 1; continue; } } - //otherwise, we send flow to nbd. - lifo.push_back((self.graph[id][nbd].dest, new_flow)); + // otherwise, we send flow to nbd. + lifo.push((self.graph[id][nbd].dest, new_flow)); } } Ok(()) } - ///This function takes a flow, and a cost function on the edges, and tries to find an + /// This function takes a flow, and a cost function on the edges, and tries to find an /// equivalent flow with a better cost, by finding improving overflow cycles. It uses /// as subroutine the Bellman Ford algorithm run up to path_length. 
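As a sanity check of the flow machinery above, here is a minimal sketch of how the public API can be exercised on a toy instance. The instance itself is hypothetical; the sketch relies only on the Graph::new, add_edge, compute_maximal_flow and get_flow_value signatures shown in this diff, and would live in a test module of graph_algo.rs:

#[cfg(test)]
mod flow_tests {
    use super::*;

    #[test]
    fn maximal_flow_on_toy_instance() -> Result<(), String> {
        // One partition (p = 0), one zone (z = 0), two storage nodes.
        let vertices = vec![
            Vertex::Source,
            Vertex::Pup(0),
            Vertex::PZ(0, 0),
            Vertex::N(0),
            Vertex::N(1),
            Vertex::Sink,
        ];
        let mut g = Graph::<FlowEdge>::new(&vertices);

        // Two copies of the partition must be placed...
        g.add_edge(Vertex::Source, Vertex::Pup(0), 2)?;
        g.add_edge(Vertex::Pup(0), Vertex::PZ(0, 0), 2)?;
        // ...and each node can hold at most one copy.
        g.add_edge(Vertex::PZ(0, 0), Vertex::N(0), 1)?;
        g.add_edge(Vertex::PZ(0, 0), Vertex::N(1), 1)?;
        g.add_edge(Vertex::N(0), Vertex::Sink, 1)?;
        g.add_edge(Vertex::N(1), Vertex::Sink, 1)?;

        g.compute_maximal_flow()?;
        // Both copies can be placed, so the flow saturates the source edges.
        assert_eq!(g.get_flow_value()?, 2);
        Ok(())
    }
}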
/// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and @@ -279,19 +267,19 @@ impl Graph { cost: &CostFunction, path_length: usize, ) -> Result<(), String> { - //We build the weighted graph g where we will look for negative cycle + // We build the weighted graph g where we will look for negative cycle let mut gf = self.build_cost_graph(cost)?; let mut cycles = gf.list_negative_cycles(path_length); while !cycles.is_empty() { - //we enumerate negative cycles + // we enumerate negative cycles for c in cycles.iter() { for i in 0..c.len() { - //We add one flow unit to the edge (u,v) of cycle c - let idu = self.vertextoid[&c[i]]; - let idv = self.vertextoid[&c[(i + 1) % c.len()]]; + // We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertex_to_id[&c[i]]; + let idv = self.vertex_to_id[&c[(i + 1) % c.len()]]; for j in 0..self.graph[idu].len() { - //since idu appears at most once in the cycles, we enumerate every - //edge at most once. + // since idu appears at most once in the cycles, we enumerate every + // edge at most once. let edge = self.graph[idu][j]; if edge.dest == idv { self.graph[idu][j].flow += 1; @@ -308,16 +296,16 @@ impl Graph { Ok(()) } - ///Construct the weighted graph G_f from the flow and the cost function + /// Construct the weighted graph G_f from the flow and the cost function fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { - let mut g = Graph::::new(&self.idtovertex); - let nb_vertices = self.idtovertex.len(); + let mut g = Graph::::new(&self.id_to_vertex); + let nb_vertices = self.id_to_vertex.len(); for i in 0..nb_vertices { for edge in self.graph[i].iter() { if edge.cap as i32 - edge.flow > 0 { - //It is possible to send overflow through this edge - let u = self.idtovertex[i]; - let v = self.idtovertex[edge.dest]; + // It is possible to send overflow through this edge + let u = self.id_to_vertex[i]; + let v = self.id_to_vertex[edge.dest]; if cost.contains_key(&(u, v)) { g.add_edge(u, v, cost[&(u, v)])?; } else if cost.contains_key(&(v, u)) { @@ -333,29 +321,26 @@ impl Graph { } impl Graph { - ///This function adds a single directed weighted edge to the graph. + /// This function adds a single directed weighted edge to the graph. pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; self.graph[idu].push(WeightedEdge { w, dest: idv }); Ok(()) } - ///This function lists the negative cycles it manages to find after path_length - ///iterations of the main loop of the Bellman-Ford algorithm. For the classical - ///algorithm, path_length needs to be equal to the number of vertices. However, - ///for particular graph structures like in our case, the algorithm is still correct - ///when path_length is the length of the longest possible simple path. - ///See the formal description of the algorithm for more details. + /// This function lists the negative cycles it manages to find after path_length + /// iterations of the main loop of the Bellman-Ford algorithm. For the classical + /// algorithm, path_length needs to be equal to the number of vertices. 
However, + /// for particular graph structures like in our case, the algorithm is still correct + /// when path_length is the length of the longest possible simple path. + /// See the formal description of the algorithm for more details. fn list_negative_cycles(&self, path_length: usize) -> Vec> { let nb_vertices = self.graph.len(); - //We start with every vertex at distance 0 of some imaginary extra -1 vertex. + // We start with every vertex at distance 0 of some imaginary extra -1 vertex. let mut distance = vec![0; nb_vertices]; - //The prev vector collects for every vertex from where does the shortest path come + // The prev vector collects for every vertex from where does the shortest path come let mut prev = vec![None; nb_vertices]; for _ in 0..path_length + 1 { @@ -369,29 +354,35 @@ impl Graph { } } - //If self.graph contains a negative cycle, then at this point the graph described - //by prev (which is a directed 1-forest/functional graph) - //must contain a cycle. We list the cycles of prev. + // If self.graph contains a negative cycle, then at this point the graph described + // by prev (which is a directed 1-forest/functional graph) + // must contain a cycle. We list the cycles of prev. let cycles_prev = cycles_of_1_forest(&prev); - //Remark that the cycle in prev is in the reverse order compared to the cycle - //in the graph. Thus the .rev(). + // Remark that the cycle in prev is in the reverse order compared to the cycle + // in the graph. Thus the .rev(). return cycles_prev .iter() - .map(|cycle| cycle.iter().rev().map(|id| self.idtovertex[*id]).collect()) + .map(|cycle| { + cycle + .iter() + .rev() + .map(|id| self.id_to_vertex[*id]) + .collect() + }) .collect(); } } -///This function returns the list of cycles of a directed 1 forest. It does not -///check for the consistency of the input. +/// This function returns the list of cycles of a directed 1 forest. It does not +/// check for the consistency of the input. fn cycles_of_1_forest(forest: &[Option]) -> Vec> { let mut cycles = Vec::>::new(); let mut time_of_discovery = vec![None; forest.len()]; for t in 0..forest.len() { let mut id = t; - //while we are on a valid undiscovered node + // while we are on a valid undiscovered node while time_of_discovery[id] == None { time_of_discovery[id] = Some(t); if let Some(i) = forest[id] { @@ -401,8 +392,8 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { } } if forest[id] != None && time_of_discovery[id] == Some(t) { - //We discovered an id that we explored at this iteration t. - //It means we are on a cycle + // We discovered an id that we explored at this iteration t. + // It means we are on a cycle let mut cy = vec![id; 1]; let mut id2 = id; while let Some(id_next) = forest[id2] { diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 38e56b88..95f69dc8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -19,7 +19,7 @@ use std::convert::TryInto; const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; -//The Message type will be used to collect information on the algorithm. +// The Message type will be used to collect information on the algorithm. type Message = Vec; /// The layout of the cluster, i.e. the list of roles @@ -30,11 +30,11 @@ pub struct ClusterLayout { pub replication_factor: usize, - ///This attribute is only used to retain the previously computed partition size, - ///to know to what extent does it change with the layout update. 
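Closing out graph_algo.rs: the cycles_of_1_forest helper above is easiest to check on a concrete input. The four-vertex functional graph below is hypothetical, and since the function is private, such a check would also live in the tests of graph_algo.rs:

#[test]
fn cycles_of_a_small_1_forest() {
    // prev-pointers: 0 -> 1 -> 2 -> 0 is a cycle; vertex 3 points into it.
    let forest = vec![Some(1), Some(2), Some(0), Some(0)];
    let cycles = cycles_of_1_forest(&forest);
    // Exactly one cycle is found, containing vertices 0, 1 and 2.
    assert_eq!(cycles.len(), 1);
    let mut cycle = cycles[0].clone();
    cycle.sort();
    assert_eq!(cycle, vec![0, 1, 2]);
}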
+ /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. pub partition_size: u32, - ///Parameters used to compute the assignation currently given by - ///ring_assignation_data + /// Parameters used to compute the assignation currently given by + /// ring_assignation_data pub parameters: LayoutParameters, pub roles: LwwMap, @@ -53,14 +53,14 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Parameters to be used in the next partition assignation computation. - pub staged_parameters: Lww, + pub staging_parameters: Lww, /// Role changes which are staged for the next version of the layout - pub staging: LwwMap, + pub staging_roles: LwwMap, pub staging_hash: Hash, } -///This struct is used to set the parameters to be used in the assignation computation -///algorithm. It is stored as a Crdt. +/// This struct is used to set the parameters to be used in the assignation computation +/// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct LayoutParameters { pub zone_redundancy: usize, @@ -114,20 +114,19 @@ impl NodeRole { } } -//Implementation of the ClusterLayout methods unrelated to the assignation algorithm. +// Implementation of the ClusterLayout methods unrelated to the assignation algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - //We set the default zone redundancy to be equal to the replication factor, - //i.e. as strict as possible. + // We set the default zone redundancy to be equal to the replication factor, + // i.e. as strict as possible. let parameters = LayoutParameters { zone_redundancy: replication_factor, }; - let staged_parameters = Lww::::new(parameters.clone()); + let staging_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); - let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); - ClusterLayout { + let mut ret = ClusterLayout { version: 0, replication_factor, partition_size: 0, @@ -135,10 +134,17 @@ impl ClusterLayout { node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), parameters, - staged_parameters, - staging: empty_lwwmap, - staging_hash: empty_lwwmap_hash, - } + staging_parameters, + staging_roles: empty_lwwmap, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + + fn calculate_staging_hash(&self) -> Hash { + let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&rmp_to_vec_all_named(&hashed_tuple).unwrap()[..]) } pub fn merge(&mut self, other: &ClusterLayout) -> bool { @@ -148,16 +154,15 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); - self.staged_parameters.merge(&other.staged_parameters); - self.staging.merge(&other.staging); + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); - let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - let stage_changed = new_staging_hash != self.staging_hash; + let new_staging_hash = self.calculate_staging_hash(); + let changed = new_staging_hash != self.staging_hash; self.staging_hash = new_staging_hash; - stage_changed || param_changed + changed } Ordering::Less => false, } @@ -179,13 +184,14 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - 
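Since the staging hash now covers both the staged roles and the staged parameters, a parameter-only update is enough for merge to report a change. A minimal sketch of that behaviour (assuming a test inside the layout module, where the private calculate_staging_hash helper is visible):

let mut layout = ClusterLayout::new(3);
let mut other = layout.clone();

// Stage a parameter change on `other` only: relax the zone redundancy.
// The Lww timestamp makes this newer value win on merge.
other
    .staging_parameters
    .update(LayoutParameters { zone_redundancy: 2 });
other.staging_hash = other.calculate_staging_hash();

// Merging `other` reports a change because the staged parameters differ
// and are part of the staging hash; merging the same data twice does not.
assert!(layout.merge(&other));
assert!(!layout.merge(&other));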
self.roles.merge(&self.staging); + self.roles.merge(&self.staging_roles); self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = self.staging_parameters.get().clone(); let msg = self.calculate_partition_assignation()?; - self.staging.clear(); - self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -208,9 +214,9 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - self.staging.clear(); - self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - self.staged_parameters.update(self.parameters.clone()); + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); + self.staging_parameters.update(self.parameters.clone()); self.version += 1; @@ -235,7 +241,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. pub fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { @@ -247,7 +253,7 @@ To know the correct value of the new layout version, invoke `garage layout show` result } - ///Given a node uuids, this function returns the label of its zone + /// Given a node uuids, this function returns the label of its zone pub fn get_node_zone(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(role) => Ok(role.zone.clone()), @@ -257,7 +263,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Given a node uuids, this function returns its capacity or fails if it does not have any + /// Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole { @@ -273,7 +279,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the number of partitions associated to this node in the ring + /// Returns the number of partitions associated to this node in the ring pub fn get_node_usage(&self, uuid: &Uuid) -> Result { for (i, id) in self.node_id_vec.iter().enumerate() { if id == uuid { @@ -293,7 +299,7 @@ To know the correct value of the new layout version, invoke `garage layout show` )) } - ///Returns the sum of capacities of non gateway nodes in the cluster + /// Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { @@ -307,7 +313,7 @@ To know the correct value of the new layout version, invoke `garage layout show` /// returns true if consistent, false if error pub fn check(&self) -> bool { // Check that the hash of the staging data is correct - let staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + let staging_hash = self.calculate_staging_hash(); if staging_hash != self.staging_hash { return false; } @@ -346,14 +352,14 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that every partition is associated to distinct nodes + // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); if 
nodes_of_p.iter().unique().count() != rf { return false; } - //Check that every partition is spread over at least zone_redundancy zones. + // Check that every partition is spread over at least zone_redundancy zones. let zones_of_p = nodes_of_p.iter().map(|n| { self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.") @@ -364,7 +370,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that the nodes capacities is consistent with the stored partitions + // Check that the nodes capacities is consistent with the stored partitions let mut node_usage = vec![0; MAX_NODE_NUMBER]; for n in self.ring_assignation_data.iter() { node_usage[*n as usize] += 1; @@ -380,8 +386,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that the partition size stored is the one computed by the asignation - //algorithm. + // Check that the partition size stored is the one computed by the asignation + // algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); match cl2.compute_optimal_partition_size(&zone_to_id) { @@ -394,7 +400,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } -//Implementation of the ClusterLayout methods related to the assignation algorithm. +// Implementation of the ClusterLayout methods related to the assignation algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor @@ -403,16 +409,13 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - // Staged role changes must be merged with nodes roles before calling this function, - // hence it must only be called from apply_staged_changes() and hence is not public. + /// Staged role changes must be merged with nodes roles before calling this function, + /// hence it must only be called from apply_staged_changes() and hence is not public. fn calculate_partition_assignation(&mut self) -> Result { - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with new ids + // We update the node ids, since the node role list might have changed with the + // changes in the layout. We retrieve the old_assignation reframed with new ids let old_assignation_opt = self.update_node_id_vec()?; - //We update the parameters - self.parameters = self.staged_parameters.get().clone(); - let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("".into()); @@ -422,8 +425,8 @@ impl ClusterLayout { self.replication_factor, self.parameters.zone_redundancy )); - //We generate for once numerical ids for the zones of non gateway nodes, - //to use them as indices in the flow graphs. + // We generate for once numerical ids for the zones of non gateway nodes, + // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; let nb_nongateway_nodes = self.nongateway_nodes().len(); @@ -443,10 +446,10 @@ impl ClusterLayout { ))); } - //We compute the optimal partition size - //Capacities should be given in a unit so that partition size is at least 100. - //In this case, integer rounding plays a marginal role in the percentages of - //optimality. 
+ // We compute the optimal partition size + // Capacities should be given in a unit so that partition size is at least 100. + // In this case, integer rounding plays a marginal role in the percentages of + // optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; if old_assignation_opt != None { @@ -461,7 +464,7 @@ impl ClusterLayout { partition_size )); } - //We write the partition size. + // We write the partition size. self.partition_size = partition_size; if partition_size < 100 { @@ -472,15 +475,15 @@ impl ClusterLayout { ); } - //We compute a first flow/assignation that is heuristically close to the previous - //assignation + // We compute a first flow/assignation that is heuristically close to the previous + // assignation let mut gflow = self.compute_candidate_assignation(&zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignation. + // We minimize the distance to the previous assignation. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } - //We display statistics of the computation + // We display statistics of the computation msg.append(&mut self.output_stat( &gflow, &old_assignation_opt, @@ -489,7 +492,7 @@ impl ClusterLayout { )?); msg.push("".to_string()); - //We update the layout structure + // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; if !self.check() { @@ -508,8 +511,8 @@ impl ClusterLayout { /// do modify assignation_ring and node_id_vec. fn update_node_id_vec(&mut self) -> Result>>, Error> { // (1) We compute the new node list - //Non gateway nodes should be coded on 8bits, hence they must be first in the list - //We build the new node ids + // Non gateway nodes should be coded on 8bits, hence they must be first in the list + // We build the new node ids let mut new_non_gateway_nodes: Vec = self .roles .items() @@ -542,12 +545,12 @@ impl ClusterLayout { self.node_id_vec = new_node_id_vec.clone(); // (2) We retrieve the old association - //We rewrite the old association with the new indices. We only consider partition - //to node assignations where the node is still in use. + // We rewrite the old association with the new indices. We only consider partition + // to node assignations where the node is still in use. let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; if self.ring_assignation_data.is_empty() { - //This is a new association + // This is a new association return Ok(None); } if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { @@ -558,11 +561,11 @@ impl ClusterLayout { )); } - //We build a translation table between the uuid and new ids + // We build a translation table between the uuid and new ids let mut uuid_to_new_id = HashMap::::new(); - //We add the indices of only the new non-gateway nodes that can be used in the - //association ring + // We add the indices of only the new non-gateway nodes that can be used in the + // association ring for (i, uuid) in new_node_id_vec.iter().enumerate() { uuid_to_new_id.insert(*uuid, i); } @@ -577,14 +580,14 @@ impl ClusterLayout { } } - //We write the ring + // We write the ring self.ring_assignation_data = Vec::::new(); Ok(Some(old_assignation)) } - ///This function generates ids for the zone of the nodes appearing in - ///self.node_id_vec. + /// This function generates ids for the zone of the nodes appearing in + /// self.node_id_vec. 
fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); @@ -607,8 +610,8 @@ impl ClusterLayout { Ok((id_to_zone, zone_to_id)) } - ///This function computes by dichotomy the largest realizable partition size, given - ///the layout roles and parameters. + /// This function computes by dichotomy the largest realizable partition size, given + /// the layout roles and parameters. fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, @@ -662,13 +665,13 @@ impl ClusterLayout { vertices } - ///Generates the graph to compute the maximal flow corresponding to the optimal - ///partition assignation. - ///exclude_assoc is the set of (partition, node) association that we are forbidden - ///to use (hence we do not add the corresponding edge to the graph). This parameter - ///is used to compute a first flow that uses only edges appearing in the previous - ///assignation. This produces a solution that heuristically should be close to the - ///previous one. + /// Generates the graph to compute the maximal flow corresponding to the optimal + /// partition assignation. + /// exclude_assoc is the set of (partition, node) association that we are forbidden + /// to use (hence we do not add the corresponding edge to the graph). This parameter + /// is used to compute a first flow that uses only edges appearing in the previous + /// assignation. This produces a solution that heuristically should be close to the + /// previous one. fn generate_flow_graph( &self, partition_size: u32, @@ -709,14 +712,14 @@ impl ClusterLayout { Ok(g) } - ///This function computes a first optimal assignation (in the form of a flow graph). + /// This function computes a first optimal assignation (in the form of a flow graph). fn compute_candidate_assignation( &self, zone_to_id: &HashMap, prev_assign_opt: &Option>>, ) -> Result, Error> { - //We list the (partition,node) associations that are not used in the - //previous assignation + // We list the (partition,node) associations that are not used in the + // previous assignation let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { let nb_nodes = self.nongateway_nodes().len(); @@ -730,13 +733,13 @@ impl ClusterLayout { } } - //We compute the best flow using only the edges used in the previous assignation + // We compute the best flow using only the edges used in the previous assignation let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; g.compute_maximal_flow()?; - //We add the excluded edges and compute the maximal flow with the full graph. - //The algorithm is such that it will start with the flow that we just computed - //and find ameliorating paths from that. + // We add the excluded edges and compute the maximal flow with the full graph. + // The algorithm is such that it will start with the flow that we just computed + // and find ameliorating paths from that. 
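The dichotomy in compute_optimal_partition_size is easiest to see on numbers. Assuming, as a simplification, that node capacities enter the flow graph as floor(capacity / partition_size) on the node-to-sink edges: with 256 partitions, replication factor 3 and zone redundancy 3 on three nodes of capacity 1000 in three distinct zones, every node must carry all 256 partitions, so the largest feasible size is floor(1000 / 3) = 333 >= 256 but floor(1000 / 4) = 250 < 256, giving an optimal size of 3. That is below 100 and would trigger the capacity-unit warning above. A sketch of the search itself, parameterized by a hypothetical feasibility oracle (in the real code, the oracle rebuilds the flow graph for size s and compares the maximal flow value to NB_PARTITIONS * replication_factor):

// Invariant (assumed): size 1 is feasible and the initial `s_up` is not.
fn largest_feasible_size(mut s_up: u32, feasible: impl Fn(u32) -> bool) -> u32 {
    let mut s_down = 1u32;
    while s_down + 1 < s_up {
        let s = (s_down + s_up) / 2;
        if feasible(s) {
            s_down = s; // a full-value flow exists: try larger partitions
        } else {
            s_up = s; // infeasible: shrink the candidate size
        }
    }
    s_down
}

It would be called along the lines of largest_feasible_size(total_capacity, |s| flow_value_at(s) == target), where flow_value_at is a hypothetical stand-in for building the graph and running compute_maximal_flow.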
for (p, n) in exclude_edge.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; @@ -745,16 +748,16 @@ impl ClusterLayout { Ok(g) } - ///This function updates the flow graph gflow to minimize the distance between - ///its corresponding assignation and the previous one + /// This function updates the flow graph gflow to minimize the distance between + /// its corresponding assignation and the previous one fn minimize_rebalance_load( &self, gflow: &mut Graph, zone_to_id: &HashMap, prev_assign: &[Vec], ) -> Result<(), Error> { - //We define a cost function on the edges (pairs of vertices) corresponding - //to the distance between the two assignations. + // We define a cost function on the edges (pairs of vertices) corresponding + // to the distance between the two assignations. let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { @@ -763,9 +766,9 @@ impl ClusterLayout { } } - //We compute the maximal length of a simple path in gflow. It is used in the - //Bellman-Ford algorithm in optimize_flow_with_cost to set the number - //of iterations. + // We compute the maximal length of a simple path in gflow. It is used in the + // Bellman-Ford algorithm in optimize_flow_with_cost to set the number + // of iterations. let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -773,7 +776,7 @@ impl ClusterLayout { Ok(()) } - ///This function updates the assignation ring from the flow graph. + /// This function updates the assignation ring from the flow graph. fn update_ring_from_flow( &mut self, nb_zones: usize, @@ -801,8 +804,8 @@ impl ClusterLayout { Ok(()) } - ///This function returns a message summing up the partition repartition of the new - ///layout, and other statistics of the partition assignation computation. + /// This function returns a message summing up the partition repartition of the new + /// layout, and other statistics of the partition assignation computation. fn output_stat( &self, gflow: &Graph, @@ -837,7 +840,7 @@ impl ClusterLayout { used_cap / self.replication_factor as u32 )); - //We define and fill in the following tables + // We define and fill in the following tables let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -879,7 +882,7 @@ impl ClusterLayout { new_partitions_zone = stored_partitions_zone.clone(); } - //We display the statistics + // We display the statistics msg.push("".into()); if *prev_assign_opt != None { @@ -951,27 +954,27 @@ impl ClusterLayout { } } -//==================================================================================== +// ==================================================================================== #[cfg(test)] mod tests { use super::{Error, *}; use std::cmp::min; - //This function checks that the partition size S computed is at least better than the - //one given by a very naive algorithm. To do so, we try to run the naive algorithm - //assuming a partion size of S+1. If we succed, it means that the optimal assignation - //was not optimal. The naive algorithm is the following : - //- we compute the max number of partitions associated to every node, capped at the - //partition number. It gives the number of tokens of every node. 
- //- every zone has a number of tokens equal to the sum of the tokens of its nodes. - //- we cycle over the partitions and associate zone tokens while respecting the - //zone redundancy constraint. - //NOTE: the naive algorithm is not optimal. Counter example: - //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - //With these parameters, the naive algo fails, whereas there is a solution: - //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + // This function checks that the partition size S computed is at least better than the + // one given by a very naive algorithm. To do so, we try to run the naive algorithm + // assuming a partion size of S+1. If we succed, it means that the optimal assignation + // was not optimal. The naive algorithm is the following : + // - we compute the max number of partitions associated to every node, capped at the + // partition number. It gives the number of tokens of every node. + // - every zone has a number of tokens equal to the sum of the tokens of its nodes. + // - we cycle over the partitions and associate zone tokens while respecting the + // zone redundancy constraint. + // NOTE: the naive algorithm is not optimal. Counter example: + // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + // With these parameters, the naive algo fails, whereas there is a solution: + // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) fn check_against_naive(cl: &ClusterLayout) -> Result { let over_size = cl.partition_size + 1; let mut zone_token = HashMap::::new(); @@ -994,8 +997,8 @@ mod tests { ); } - //For every partition, we count the number of zone already associated and - //the name of the last zone associated + // For every partition, we count the number of zone already associated and + // the name of the last zone associated let mut id_zone_token = vec![0; zones.len()]; for (z, t) in zone_token.iter() { @@ -1049,7 +1052,7 @@ mod tests { cl.node_id_vec.push(x); } - let update = cl.staging.update_mutator( + let update = cl.staging_roles.update_mutator( cl.node_id_vec[i], NodeRoleV(Some(NodeRole { zone: (node_zone_vec[i].to_string()), @@ -1057,10 +1060,10 @@ mod tests { tags: (vec![]), })), ); - cl.staging.merge(&update); + cl.staging_roles.merge(&update); } - cl.staging_hash = blake2sum(&rmp_to_vec_all_named(&cl.staging).unwrap()[..]); - cl.staged_parameters + cl.staging_hash = cl.calculate_staging_hash(); + cl.staging_parameters .update(LayoutParameters { zone_redundancy }); } -- cgit v1.2.3 From fd5bc142b553d716c8265d83cff0bb633aa09e6b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 20:29:25 +0100 Subject: Ensure .sort() is called before counting unique items --- src/rpc/layout.rs | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'src') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 95f69dc8..15765662 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -355,17 +355,22 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + let mut nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + nodes_of_p.sort(); if nodes_of_p.iter().unique().count() != rf { return false; } // Check that every 
partition is spread over at least zone_redundancy zones. - let zones_of_p = nodes_of_p.iter().map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }); + let mut zones_of_p = nodes_of_p + .iter() + .map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }) + .collect::>(); + zones_of_p.sort(); let redundancy = self.parameters.zone_redundancy; - if zones_of_p.unique().count() < redundancy { + if zones_of_p.iter().unique().count() < redundancy { return false; } } @@ -378,9 +383,7 @@ To know the correct value of the new layout version, invoke `garage layout show` for (n, usage) in node_usage.iter().enumerate() { if *usage > 0 { let uuid = self.node_id_vec[n]; - if usage * self.partition_size - > self.get_node_capacity(&uuid).expect("Critical Error") - { + if usage * self.partition_size > self.get_node_capacity(&uuid).unwrap() { return false; } } @@ -389,7 +392,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that the partition size stored is the one computed by the asignation // algorithm. let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); match cl2.compute_optimal_partition_size(&zone_to_id) { Ok(s) if s != self.partition_size => return false, Err(_) => return false, @@ -484,12 +487,7 @@ impl ClusterLayout { } // We display statistics of the computation - msg.append(&mut self.output_stat( - &gflow, - &old_assignation_opt, - &zone_to_id, - &id_to_zone, - )?); + msg.extend(self.output_stat(&gflow, &old_assignation_opt, &zone_to_id, &id_to_zone)?); msg.push("".to_string()); // We update the layout structure -- cgit v1.2.3 From 73a4ca8b1515f95bf7860fc292c12db83d3c6228 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 21:12:11 +0100 Subject: Use bytes as capacity units --- src/garage/cli/layout.rs | 18 +++++++++++--- src/garage/cli/structs.rs | 4 +-- src/rpc/Cargo.toml | 1 + src/rpc/graph_algo.rs | 34 +++++++++++++------------- src/rpc/layout.rs | 62 ++++++++++++++++++++++++----------------------- 5 files changed, 66 insertions(+), 53 deletions(-) (limited to 'src') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 4b23a096..85af345a 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,3 +1,5 @@ +use bytesize::ByteSize; + use garage_util::crdt::Crdt; use garage_util::error::*; use garage_util::formater::format_table; @@ -86,7 +88,7 @@ pub async fn cmd_assign_role( return Err(Error::Message( "-c and -g are mutually exclusive, please configure node either with c>0 to act as a storage node or with -g to act as a gateway node".into())); } - if args.capacity == Some(0) { + if args.capacity == Some(ByteSize::b(0)) { return Err(Error::Message("Invalid capacity value: 0".into())); } @@ -94,7 +96,7 @@ pub async fn cmd_assign_role( let new_entry = match roles.get(&added_node) { Some(NodeRoleV(Some(old))) => { let capacity = match args.capacity { - Some(c) => Some(c), + Some(c) => Some(c.as_u64()), None if args.gateway => None, None => old.capacity, }; @@ -111,7 +113,7 @@ pub async fn cmd_assign_role( } _ => { let capacity = match args.capacity { - Some(c) => Some(c), + Some(c) => Some(c.as_u64()), None if args.gateway => None, None => return Err(Error::Message( "Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())), @@ -265,6 +267,7 @@ pub async fn 
cmd_config_layout( ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let mut did_something = false; match config_opt.redundancy { None => (), Some(r) => { @@ -282,9 +285,16 @@ pub async fn cmd_config_layout( .update(LayoutParameters { zone_redundancy: r }); println!("The new zone redundancy has been saved ({}).", r); } + did_something = true; } } + if !did_something { + return Err(Error::Message( + "Please specify an action for `garage layout config` to do".into(), + )); + } + send_layout(rpc_cli, rpc_host, layout).await?; Ok(()) } @@ -335,7 +345,7 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { tags, role.zone, role.capacity_string(), - usage as u32 * layout.partition_size, + ByteSize::b(usage as u64 * layout.partition_size).to_string_as(false), (100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32) )); } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 64798952..49a1f267 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -114,9 +114,9 @@ pub struct AssignRoleOpt { #[structopt(short = "z", long = "zone")] pub(crate) zone: Option, - /// Capacity (in relative terms) + /// Storage capacity, in bytes (supported suffixes: B, KB, MB, GB, TB, PB) #[structopt(short = "c", long = "capacity")] - pub(crate) capacity: Option, + pub(crate) capacity: Option, /// Gateway-only node #[structopt(short = "g", long = "gateway")] diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 5a427131..1b411c6a 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -18,6 +18,7 @@ garage_util = { version = "0.8.0", path = "../util" } arc-swap = "1.0" bytes = "1.0" +bytesize = "1.1" gethostname = "0.2" hex = "0.4" tracing = "0.1.30" diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 1e4a819b..f181e2ba 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -23,8 +23,8 @@ pub enum Vertex { /// Edge data structure for the flow algorithm. #[derive(Clone, Copy, Debug)] pub struct FlowEdge { - cap: u32, // flow maximal capacity of the edge - flow: i32, // flow value on the edge + cap: u64, // flow maximal capacity of the edge + flow: i64, // flow value on the edge dest: usize, // destination vertex id rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v } @@ -32,7 +32,7 @@ pub struct FlowEdge { /// Edge data structure for the detection of negative cycles. #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { - w: i32, // weight of the edge + w: i64, // weight of the edge dest: usize, } @@ -51,7 +51,7 @@ pub struct Graph { graph: Vec>, } -pub type CostFunction = HashMap<(Vertex, Vertex), i32>; +pub type CostFunction = HashMap<(Vertex, Vertex), i64>; impl Graph { pub fn new(vertices: &[Vertex]) -> Self { @@ -77,7 +77,7 @@ impl Graph { impl Graph { /// This function adds a directed edge to the graph with capacity c, and the /// corresponding reversed edge with capacity 0. - pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { + pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u64) -> Result<(), String> { let idu = self.get_vertex_id(&u)?; let idv = self.get_vertex_id(&v)?; if idu == idv { @@ -115,7 +115,7 @@ impl Graph { } /// This function returns the value of the flow incoming to v. 
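The bytesize crate introduced here accepts human-friendly suffixes on the command line and converts them to raw byte counts internally; structopt parses the Option<ByteSize> field through FromStr. A short illustration of the round-trip, assuming bytesize 1.1's SI interpretation and formatting as pinned in Cargo.toml above:

use bytesize::ByteSize;

// Parsing a CLI argument such as `-c 300GB`:
let cap: ByteSize = "300GB".parse().unwrap();
assert_eq!(cap.as_u64(), 300_000_000_000);

// Formatting for display; `false` selects decimal (SI) units, as used by
// capacity_string() and print_cluster_layout().
assert_eq!(ByteSize::b(300_000_000_000).to_string_as(false), "300.0 GB");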
- pub fn get_inflow(&self, v: Vertex) -> Result { + pub fn get_inflow(&self, v: Vertex) -> Result { let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { @@ -125,7 +125,7 @@ impl Graph { } /// This function returns the value of the flow outgoing from v. - pub fn get_outflow(&self, v: Vertex) -> Result { + pub fn get_outflow(&self, v: Vertex) -> Result { let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { @@ -136,7 +136,7 @@ impl Graph { /// This function computes the flow total value by computing the outgoing flow /// from the source. - pub fn get_flow_value(&mut self) -> Result { + pub fn get_flow_value(&mut self) -> Result { self.get_outflow(Vertex::Source) } @@ -156,7 +156,7 @@ impl Graph { } /// Computes an upper bound of the flow on the graph - pub fn flow_upper_bound(&self) -> Result { + pub fn flow_upper_bound(&self) -> Result { let idsource = self.get_vertex_id(&Vertex::Source)?; let mut flow_upper_bound = 0; for edge in self.graph[idsource].iter() { @@ -193,7 +193,7 @@ impl Graph { // it means id has not yet been reached level[id] = Some(lvl); for edge in self.graph[id].iter() { - if edge.cap as i32 - edge.flow > 0 { + if edge.cap as i64 - edge.flow > 0 { fifo.push_back((edge.dest, lvl + 1)); } } @@ -216,10 +216,10 @@ impl Graph { lifo.pop(); while let Some((id, _)) = lifo.pop() { let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i32; + self.graph[id][nbd].flow += f as i64; let id_rev = self.graph[id][nbd].dest; let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i32; + self.graph[id_rev][nbd_rev].flow -= f as i64; } lifo.push((idsource, flow_upper_bound)); continue; @@ -236,9 +236,9 @@ impl Graph { } // else we can try to send flow from id to its nbd let new_flow = min( - f as i32, - self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow, - ) as u32; + f as i64, + self.graph[id][nbd].cap as i64 - self.graph[id][nbd].flow, + ) as u64; if new_flow == 0 { next_nbd[id] += 1; continue; @@ -302,7 +302,7 @@ impl Graph { let nb_vertices = self.id_to_vertex.len(); for i in 0..nb_vertices { for edge in self.graph[i].iter() { - if edge.cap as i32 - edge.flow > 0 { + if edge.cap as i64 - edge.flow > 0 { // It is possible to send overflow through this edge let u = self.id_to_vertex[i]; let v = self.id_to_vertex[edge.dest]; @@ -322,7 +322,7 @@ impl Graph { impl Graph { /// This function adds a single directed weighted edge to the graph. - pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { + pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i64) -> Result<(), String> { let idu = self.get_vertex_id(&u)?; let idv = self.get_vertex_id(&v)?; self.graph[idu].push(WeightedEdge { w, dest: idv }); diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 15765662..3c80b213 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; -use hex::ToHex; +use bytesize::ByteSize; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -32,7 +32,7 @@ pub struct ClusterLayout { /// This attribute is only used to retain the previously computed partition size, /// to know to what extent does it change with the layout update. 
- pub partition_size: u32, + pub partition_size: u64, /// Parameters used to compute the assignation currently given by /// ring_assignation_data pub parameters: LayoutParameters, @@ -86,8 +86,7 @@ pub struct NodeRole { /// The capacity of the node /// If this is set to None, the node does not participate in storing data for the system /// and is only active as an API gateway to other nodes - // TODO : change the capacity to u64 and use byte unit input/output - pub capacity: Option, + pub capacity: Option, /// A set of tags to recognize the node pub tags: Vec, } @@ -95,7 +94,7 @@ pub struct NodeRole { impl NodeRole { pub fn capacity_string(&self) -> String { match self.capacity { - Some(c) => format!("{}", c), + Some(c) => ByteSize::b(c).to_string_as(false), None => "gateway".to_string(), } } @@ -264,7 +263,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole { capacity: Some(cap), @@ -300,7 +299,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -458,13 +457,14 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!( "Optimal size of a partition: {} (was {} in the previous layout).", - partition_size, self.partition_size + ByteSize::b(partition_size).to_string_as(false), + ByteSize::b(self.partition_size).to_string_as(false) )); } else { msg.push(format!( "Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", - partition_size + ByteSize::b(partition_size).to_string_as(false) )); } // We write the partition size. @@ -613,7 +613,7 @@ impl ClusterLayout { fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, - ) -> Result { + ) -> Result { let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; @@ -672,7 +672,7 @@ impl ClusterLayout { /// previous one. 
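Widening these fields from u32 to u64 is not cosmetic: with capacities now expressed in bytes, a single 1 TB node already exceeds u32::MAX, and derived quantities such as the used capacity in output_stat scale accordingly. Some illustrative arithmetic:

let one_tb: u64 = 1_000_000_000_000;
// A byte-denominated capacity no longer fits in the old 32-bit fields:
assert!(one_tb > u32::MAX as u64);

// With 256 partitions all stored on such a node, each partition copy
// weighs about 3.9 GB...
let partition_size = one_tb / 256;
assert_eq!(partition_size, 3_906_250_000);

// ...and the total used capacity (256 partitions, replication factor 3)
// overflows u32 on its own.
let used_cap = partition_size * 256 * 3;
assert!(used_cap > u32::MAX as u64);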
fn generate_flow_graph( &self, - partition_size: u32, + partition_size: u64, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, ) -> Result, Error> { @@ -682,18 +682,18 @@ impl ClusterLayout { let nb_zones = zone_to_id.len(); let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; g.add_edge( Vertex::Source, Vertex::Pdown(p), - (self.replication_factor - redundancy) as u32, + (self.replication_factor - redundancy) as u64, )?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; g.add_edge( Vertex::Pdown(p), Vertex::PZ(p, z), - self.replication_factor as u32, + self.replication_factor as u64, )?; } } @@ -813,17 +813,19 @@ impl ClusterLayout { ) -> Result { let mut msg = Message::new(); - let used_cap = self.partition_size * NB_PARTITIONS as u32 * self.replication_factor as u32; + let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push("".into()); msg.push(format!( "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", - used_cap, total_cap, percent_cap + ByteSize::b(used_cap).to_string_as(false), + ByteSize::b(total_cap).to_string_as(false), + percent_cap )); msg.push("".into()); msg.push( - "If the percentage is to low, it might be that the \ + "If the percentage is too low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ @@ -833,9 +835,9 @@ impl ClusterLayout { msg.push(format!( "Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", - used_cap, + ByteSize::b(used_cap).to_string_as(false), self.replication_factor, - used_cap / self.replication_factor as u32 + ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) )); // We define and fill in the following tables @@ -914,34 +916,34 @@ impl ClusterLayout { replicated_partitions )); - let available_cap_z: u32 = self.partition_size * replicated_partitions as u32; + let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let mut total_cap_z = 0; for n in nodes_of_z.iter() { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); msg.push(format!( - " Usable capacity / Total capacity: {}/{} ({:.1}%).", - available_cap_z, total_cap_z, percent_cap_z + " Usable capacity / Total capacity: {} / {} ({:.1}%).", + ByteSize::b(available_cap_z).to_string_as(false), + ByteSize::b(total_cap_z).to_string_as(false), + percent_cap_z )); for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u32 * self.partition_size; + let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; let tags_n = (self .node_role(&self.node_id_vec[*n]) .ok_or("Node not found."))? 
.tags_string(); msg.push(format!( - " Node {}: {} partitions ({} new) ; \ + " Node {:?}: {} partitions ({} new) ; \ usable/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec()[0..2] - .to_vec() - .encode_hex::(), + self.node_id_vec[*n], stored_partitions[*n], new_partitions[*n], - available_cap_n, - total_cap_n, + ByteSize::b(available_cap_n).to_string_as(false), + ByteSize::b(total_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, tags_n )); @@ -1041,7 +1043,7 @@ mod tests { fn update_layout( cl: &mut ClusterLayout, node_id_vec: &Vec, - node_capacity_vec: &Vec, + node_capacity_vec: &Vec, node_zone_vec: &Vec, zone_redundancy: usize, ) { -- cgit v1.2.3 From d75b37b018fc0ce8e3832c8531d9556ff7a345c9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 14:23:08 +0100 Subject: Return more info when layout's .check() fails, fix compilation, fix test --- src/api/admin/cluster.rs | 7 ++--- src/db/lib.rs | 2 -- src/garage/cli/layout.rs | 32 ++++++++++++++-------- src/garage/main.rs | 3 +++ src/rpc/layout.rs | 70 +++++++++++++++++++++++++++++++----------------- src/rpc/system.rs | 6 ++--- 6 files changed, 75 insertions(+), 45 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 040778b1..7b91f709 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -163,16 +163,13 @@ pub async fn handle_apply_cluster_layout( let layout = garage.system.get_cluster_layout(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; - //TODO : how to display msg ? Should it be in the Body Response ? - for s in msg.iter() { - println!("{}", s); - } garage.system.update_cluster_layout(&layout).await?; Ok(Response::builder() .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) + .header(http::header::CONTENT_TYPE, "text/plain") + .body(Body::from(msg.join("\n")))?) 
 }
 
 pub async fn handle_revert_cluster_layout(
diff --git a/src/db/lib.rs b/src/db/lib.rs
index 0a776a91..5304c195 100644
--- a/src/db/lib.rs
+++ b/src/db/lib.rs
@@ -2,8 +2,6 @@
 #[cfg(feature = "sqlite")]
 extern crate tracing;
 
-#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))]
-//compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite.");
 #[cfg(feature = "lmdb")]
 pub mod lmdb_adapter;
 #[cfg(feature = "sled")]
diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index 85af345a..53430e6b 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -330,7 +330,7 @@ pub async fn send_layout(
 }
 
 pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
-	let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable".to_string()];
+	let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
 	for (id, _, role) in layout.roles.items().iter() {
 		let role = match &role.0 {
 			Some(r) => r,
@@ -338,16 +338,26 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
 		};
 		let tags = role.tags.join(",");
 		let usage = layout.get_node_usage(id).unwrap_or(0);
-		let capacity = layout.get_node_capacity(id).unwrap_or(1);
-		table.push(format!(
-			"{:?}\t{}\t{}\t{}\t{} ({:.1}%)",
-			id,
-			tags,
-			role.zone,
-			role.capacity_string(),
-			ByteSize::b(usage as u64 * layout.partition_size).to_string_as(false),
-			(100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32)
-		));
+		let capacity = layout.get_node_capacity(id).unwrap_or(0);
+		if capacity > 0 {
+			table.push(format!(
+				"{:?}\t{}\t{}\t{}\t{} ({:.1}%)",
+				id,
+				tags,
+				role.zone,
+				role.capacity_string(),
+				ByteSize::b(usage as u64 * layout.partition_size).to_string_as(false),
+				(100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32)
+			));
+		} else {
+			table.push(format!(
+				"{:?}\t{}\t{}\t{}",
+				id,
+				tags,
+				role.zone,
+				role.capacity_string(),
+			));
+		};
 	}
 	println!();
 	println!("Parameters of the layout computation:");
diff --git a/src/garage/main.rs b/src/garage/main.rs
index edda734b..8e64273f 100644
--- a/src/garage/main.rs
+++ b/src/garage/main.rs
@@ -17,6 +17,9 @@ compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled
 #[cfg(all(feature = "bundled-libs", feature = "system-libs"))]
 compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled");
 
+#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))]
+compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite.");
+
 use std::net::SocketAddr;
 use std::path::PathBuf;
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs
index 3c80b213..2f4dc129 100644
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@@ -187,11 +187,11 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		self.roles.retain(|(_, _, v)| v.0.is_some());
 		self.parameters = self.staging_parameters.get().clone();
 
-		let msg = self.calculate_partition_assignation()?;
-
 		self.staging_roles.clear();
 		self.staging_hash = self.calculate_staging_hash();
 
+		let msg = self.calculate_partition_assignation()?;
+
 		self.version += 1;
 
 		Ok((self, msg))
@@ -214,8 +214,8 @@ To know the correct value of the new layout version, invoke `garage layout show`
 	}
 
 	self.staging_roles.clear();
-	self.staging_hash = self.calculate_staging_hash();
 	self.staging_parameters.update(self.parameters.clone());
+	self.staging_hash = self.calculate_staging_hash();
 
 	self.version += 1;
@@ -310,11 +310,11 @@ To know the correct value of the new layout version, invoke `garage layout show`
 	/// Check a cluster layout for internal consistency
 	/// (assignation, roles, parameters, partition size)
 	/// returns true if consistent, false if error
-	pub fn check(&self) -> bool {
+	pub fn check(&self) -> Result<(), String> {
 		// Check that the hash of the staging data is correct
 		let staging_hash = self.calculate_staging_hash();
 		if staging_hash != self.staging_hash {
-			return false;
+			return Err("staging_hash is incorrect".into());
 		}
 
 		// Check that node_id_vec contains the correct list of nodes
@@ -329,12 +329,17 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		let mut node_id_vec = self.node_id_vec.clone();
 		node_id_vec.sort();
 		if expected_nodes != node_id_vec {
-			return false;
+			return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes));
 		}
 
 		// Check that the assignation data has the correct length
-		if self.ring_assignation_data.len() != (1 << PARTITION_BITS) * self.replication_factor {
-			return false;
+		let expected_assignation_data_len = (1 << PARTITION_BITS) * self.replication_factor;
+		if self.ring_assignation_data.len() != expected_assignation_data_len {
+			return Err(format!(
+				"ring_assignation_data has incorrect length {} instead of {}",
+				self.ring_assignation_data.len(),
+				expected_assignation_data_len
+			));
 		}
 
 		// Check that the assigned nodes are correct identifiers
@@ -342,12 +347,15 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		// and that role is not the role of a gateway nodes
 		for x in self.ring_assignation_data.iter() {
 			if *x as usize >= self.node_id_vec.len() {
-				return false;
+				return Err(format!(
+					"ring_assignation_data contains invalid node id {}",
+					*x
+				));
 			}
 			let node = self.node_id_vec[*x as usize];
 			match self.roles.get(&node) {
 				Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (),
-				_ => return false,
+				_ => return Err("ring_assignation_data contains id of a gateway node".into()),
 			}
 		}
 
@@ -357,7 +365,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
 			let mut nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec();
 			nodes_of_p.sort();
 			if nodes_of_p.iter().unique().count() != rf {
-				return false;
+				return Err(format!("partition does not contain {} unique node ids", rf));
 			}
 			// Check that every partition is spread over at least zone_redundancy zones.
 			let mut zones_of_p = nodes_of_p
@@ -370,7 +378,10 @@ To know the correct value of the new layout version, invoke `garage layout show`
 			zones_of_p.sort();
 			let redundancy = self.parameters.zone_redundancy;
 			if zones_of_p.iter().unique().count() < redundancy {
-				return false;
+				return Err(format!(
+					"nodes of partition are in less than {} distinct zones",
+					redundancy
+				));
 			}
 		}
 
@@ -382,8 +393,14 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		for (n, usage) in node_usage.iter().enumerate() {
 			if *usage > 0 {
 				let uuid = self.node_id_vec[n];
-				if usage * self.partition_size > self.get_node_capacity(&uuid).unwrap() {
-					return false;
+				let partusage = usage * self.partition_size;
+				let nodecap = self.get_node_capacity(&uuid).unwrap();
+				if partusage > nodecap {
+					return Err(format!(
+						"node usage ({}) is bigger than node capacity ({})",
+						usage * self.partition_size,
+						nodecap
+					));
 				}
 			}
 		}
@@ -393,12 +410,17 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		let cl2 = self.clone();
 		let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap();
 		match cl2.compute_optimal_partition_size(&zone_to_id) {
-			Ok(s) if s != self.partition_size => return false,
-			Err(_) => return false,
+			Ok(s) if s != self.partition_size => {
+				return Err(format!(
+					"partition_size ({}) is different than optimal value ({})",
+					self.partition_size, s
+				))
+			}
+			Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)),
 			_ => (),
 		}
 
-		true
+		Ok(())
 	}
 }
@@ -493,9 +515,9 @@ impl ClusterLayout {
 		// We update the layout structure
 		self.update_ring_from_flow(id_to_zone.len(), &gflow)?;
 
-		if !self.check() {
+		if let Err(e) = self.check() {
 			return Err(Error::Message(
-				"Critical error: The computed layout happens to be incorrect".into(),
+				format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n"))
 			));
 		}
 
@@ -1062,9 +1084,9 @@ mod tests {
 			);
 			cl.staging_roles.merge(&update);
 		}
-		cl.staging_hash = cl.calculate_staging_hash();
 		cl.staging_parameters
 			.update(LayoutParameters { zone_redundancy });
+		cl.staging_hash = cl.calculate_staging_hash();
 	}
 
 	#[test]
@@ -1081,7 +1103,7 @@ mod tests {
 		let v = cl.version;
 		let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
 		show_msg(&msg);
-		assert!(cl.check());
+		assert_eq!(cl.check(), Ok(()));
 		assert!(matches!(check_against_naive(&cl), Ok(true)));
 
 		node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
@@ -1094,7 +1116,7 @@ mod tests {
 		let v = cl.version;
 		let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
 		show_msg(&msg);
-		assert!(cl.check());
+		assert_eq!(cl.check(), Ok(()));
 		assert!(matches!(check_against_naive(&cl), Ok(true)));
 
 		node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
@@ -1102,7 +1124,7 @@ mod tests {
 		let v = cl.version;
 		let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
 		show_msg(&msg);
-		assert!(cl.check());
+		assert_eq!(cl.check(), Ok(()));
 		assert!(matches!(check_against_naive(&cl), Ok(true)));
 
 		node_capacity_vec = vec![
@@ -1112,7 +1134,7 @@ mod tests {
 		let v = cl.version;
 		let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
 		show_msg(&msg);
-		assert!(cl.check());
+		assert_eq!(cl.check(), Ok(()));
 		assert!(matches!(check_against_naive(&cl), Ok(true)));
 	}
 }
diff --git a/src/rpc/system.rs b/src/rpc/system.rs
index d6576f20..224fbabb 100644
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@@ -565,9 +565,9 @@ impl System {
 		let update_ring = self.update_ring.lock().await;
 		let mut layout: ClusterLayout = self.ring.borrow().layout.clone();
 
-		let prev_layout_check = layout.check();
+		let prev_layout_check = layout.check().is_ok();
 
 		if layout.merge(adv) {
-			if prev_layout_check && !layout.check() {
+			if prev_layout_check && !layout.check().is_ok() {
 				error!("New cluster layout is invalid, discarding.");
 				return Err(Error::Message(
 					"New cluster layout is invalid, discarding.".into(),
@@ -620,7 +620,7 @@ impl System {
 
 	async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
 		while !*stop_signal.borrow() {
-			let not_configured = !self.ring.borrow().layout.check();
+			let not_configured = !self.ring.borrow().layout.check().is_ok();
 			let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
 			let expected_n_nodes = self.ring.borrow().layout.num_nodes();
 			let bad_peers = self
-- cgit v1.2.3

From fc2729cd810b94c47d89e9039ea65726c9a85988 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Tue, 8 Nov 2022 15:13:37 +0100
Subject: Fix integration test

---
 src/garage/tests/common/garage.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs
index 44d727f9..a539abb7 100644
--- a/src/garage/tests/common/garage.rs
+++ b/src/garage/tests/common/garage.rs
@@ -126,7 +126,7 @@ api_bind_addr = "127.0.0.1:{admin_port}"
 		self.command()
 			.args(["layout", "assign"])
 			.arg(node_short_id)
-			.args(["-c", "1", "-z", "unzonned"])
+			.args(["-c", "1G", "-z", "unzonned"])
 			.quiet()
 			.expect_success_status("Could not assign garage node layout");
 		self.command()
-- cgit v1.2.3

From 217abdca18ff15190c0407b2b8b1ea204edcfb99 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Tue, 8 Nov 2022 15:38:53 +0100
Subject: Fix HTTP return code

---
 src/api/admin/cluster.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs
index 7b91f709..4386c0cc 100644
--- a/src/api/admin/cluster.rs
+++ b/src/api/admin/cluster.rs
@@ -167,7 +167,7 @@ pub async fn handle_apply_cluster_layout(
 	garage.system.update_cluster_layout(&layout).await?;
 
 	Ok(Response::builder()
-		.status(StatusCode::NO_CONTENT)
+		.status(StatusCode::OK)
 		.header(http::header::CONTENT_TYPE, "text/plain")
 		.body(Body::from(msg.join("\n")))?)
 }
-- cgit v1.2.3

From ec12d6c8ddde0f1dc908e43fef0ecc88d1e5406b Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Tue, 8 Nov 2022 16:15:45 +0100
Subject: Slightly simplify code at places

---
 src/garage/cli/layout.rs | 11 ++++-----
 src/rpc/layout.rs        | 61 ++++++++++++++----------------------------------
 2 files changed, 22 insertions(+), 50 deletions(-)

(limited to 'src')

diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index 53430e6b..27bb7eb8 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -209,16 +209,13 @@ pub async fn cmd_show_layout(
 			"You can also revert all proposed changes with: garage layout revert --version {}",
 			v + 1)
 		}
-		Err(Error::Message(s)) => {
-			println!("Error while trying to compute the assignation: {}", s);
+		Err(e) => {
+			println!("Error while trying to compute the assignation: {}", e);
 			println!("This new layout cannot yet be applied.");
 			println!(
 				"You can also revert all proposed changes with: garage layout revert --version {}",
 				v + 1)
 		}
-		_ => {
-			println!("Unknown Error");
-		}
 	}
 }
@@ -355,7 +352,7 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
 				id,
 				tags,
 				role.zone,
-				role.capacity_string(),
+				role.capacity_string()
 			));
 		};
 	}
@@ -372,7 +369,7 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
 }
 
 pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool {
-	let has_changes = layout.staging_parameters.get().clone() != layout.parameters;
+	let has_changes = *layout.staging_parameters.get() != layout.parameters;
 	if has_changes {
 		println!();
 		println!("==== NEW LAYOUT PARAMETERS ====");
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs
index 2f4dc129..133e33c8 100644
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@@ -100,16 +100,7 @@ impl NodeRole {
 	}
 
 	pub fn tags_string(&self) -> String {
-		let mut tags = String::new();
-		if self.tags.is_empty() {
-			return tags;
-		}
-		tags.push_str(&self.tags[0].clone());
-		for t in 1..self.tags.len() {
-			tags.push(',');
-			tags.push_str(&self.tags[t].clone());
-		}
-		tags
+		self.tags.join(",")
 	}
 }
@@ -241,7 +232,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
 	}
 
 	/// Returns the uuids of the non_gateway nodes in self.node_id_vec.
-	pub fn nongateway_nodes(&self) -> Vec<Uuid> {
+	fn nongateway_nodes(&self) -> Vec<Uuid> {
 		let mut result = Vec::<Uuid>::new();
 		for uuid in self.node_id_vec.iter() {
 			match self.node_role(uuid) {
@@ -253,7 +244,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
 	}
 
 	/// Given a node uuids, this function returns the label of its zone
-	pub fn get_node_zone(&self, uuid: &Uuid) -> Result<String, Error> {
+	fn get_node_zone(&self, uuid: &Uuid) -> Result<String, Error> {
 		match self.node_role(uuid) {
 			Some(role) => Ok(role.zone.clone()),
 			_ => Err(Error::Message(
@@ -299,7 +290,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
 	}
 
 	/// Returns the sum of capacities of non gateway nodes in the cluster
-	pub fn get_total_capacity(&self) -> Result<u64, Error> {
+	fn get_total_capacity(&self) -> Result<u64, Error> {
 		let mut total_capacity = 0;
 		for uuid in self.nongateway_nodes().iter() {
 			total_capacity += self.get_node_capacity(uuid)?;
@@ -494,8 +485,7 @@ impl ClusterLayout {
 
 		if partition_size < 100 {
 			msg.push(
-				"WARNING: The partition size is low (< 100), you might consider to \
-				provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb)."
+ "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" .into(), ); } @@ -533,7 +523,7 @@ impl ClusterLayout { // (1) We compute the new node list // Non gateway nodes should be coded on 8bits, hence they must be first in the list // We build the new node ids - let mut new_non_gateway_nodes: Vec = self + let new_non_gateway_nodes: Vec = self .roles .items() .iter() @@ -549,7 +539,7 @@ impl ClusterLayout { ))); } - let mut new_gateway_nodes: Vec = self + let new_gateway_nodes: Vec = self .roles .items() .iter() @@ -558,8 +548,8 @@ impl ClusterLayout { .collect(); let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.append(&mut new_non_gateway_nodes); - new_node_id_vec.append(&mut new_gateway_nodes); + new_node_id_vec.extend(new_non_gateway_nodes); + new_node_id_vec.extend(new_gateway_nodes); let old_node_id_vec = self.node_id_vec.clone(); self.node_id_vec = new_node_id_vec.clone(); @@ -567,12 +557,11 @@ impl ClusterLayout { // (2) We retrieve the old association // We rewrite the old association with the new indices. We only consider partition // to node assignations where the node is still in use. - let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; - if self.ring_assignation_data.is_empty() { // This is a new association return Ok(None); } + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( "The old assignation does not have a size corresponding to \ @@ -590,7 +579,9 @@ impl ClusterLayout { uuid_to_new_id.insert(*uuid, i); } + let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; let rf = self.replication_factor; + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { for old_id in &self.ring_assignation_data[p * rf..(p + 1) * rf] { let uuid = old_node_id_vec[*old_id as usize]; @@ -613,18 +604,10 @@ impl ClusterLayout { let mut zone_to_id = HashMap::::new(); for uuid in self.nongateway_nodes().iter() { - if self.roles.get(uuid) == None { - return Err(Error::Message( - "The uuid was not found in the node roles (this should \ - not happen, it might be a critical error)." - .into(), - )); - } - if let Some(r) = self.node_role(uuid) { - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone(), id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } + let r = self.node_role(uuid).unwrap(); + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); } } Ok((id_to_zone, zone_to_id)) @@ -639,11 +622,7 @@ impl ClusterLayout { let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; - if g.get_flow_value()? - < (NB_PARTITIONS * self.replication_factor) - .try_into() - .unwrap() - { + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { return Err(Error::Message( "The storage capacity of he cluster is to small. It is \ impossible to store partitions of size 1." @@ -656,11 +635,7 @@ impl ClusterLayout { while s_down + 1 < s_up { g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; - if g.get_flow_value()? - < (NB_PARTITIONS * self.replication_factor) - .try_into() - .unwrap() - { + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { s_up = (s_down + s_up) / 2; } else { s_down = (s_down + s_up) / 2; -- cgit v1.2.3