From c1d1646c4d62300ec48503aa65623ee7e3df8685 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 1 May 2022 09:54:19 +0200 Subject: Change the way new layout assignations are computed. The function now computes an optimal assignation (with respect to partition size) that minimizes the distance to the former assignation, using flow algorithms. This commit was written by Mendes Oulamara --- src/rpc/Cargo.toml | 1 + src/rpc/layout.rs | 881 +++++++++++++++++++++++++------------------------- src/util/bipartite.rs | 378 ++++++++++++++++++++++ src/util/lib.rs | 1 + 4 files changed, 827 insertions(+), 434 deletions(-) create mode 100644 src/util/bipartite.rs diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index efaacf2e..654c1dc6 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -23,6 +23,7 @@ gethostname = "0.2" hex = "0.4" tracing = "0.1.30" rand = "0.8" +itertools="0.10" sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } async-trait = "0.1.7" diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index b9c02c21..afd7df17 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,10 +1,14 @@ use std::cmp::Ordering; -use std::collections::{HashMap, HashSet}; +use std::cmp::{min}; +use std::collections::{HashMap}; use serde::{Deserialize, Serialize}; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; +use garage_util::bipartite::*; + +use rand::prelude::SliceRandom; use crate::ring::*; @@ -164,445 +168,454 @@ impl ClusterLayout { true } - /// Calculate an assignation of partitions to nodes - pub fn calculate_partition_assignation(&mut self) -> bool { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); - - println!("Calculating updated partition assignation, this may take some time..."); - println!(); - - // Get old partition assignation - let old_partitions = self.parse_assignation_data(); - - // Start new partition assignation with nodes from old assignation where it 
is relevant - let mut partitions = old_partitions - .iter() - .map(|old_part| { - let mut new_part = PartitionAss::new(); - for node in old_part.nodes.iter() { - if let Some(role) = node.1 { - if role.capacity.is_some() { - new_part.add(None, n_zones, node.0, role); - } - } - } - new_part - }) - .collect::>(); - - // In various cases, not enough nodes will have been added for all partitions - // in the step above (e.g. due to node removals, or new zones being added). - // Here we add more nodes to make a complete (but sub-optimal) assignation, - // using an initial partition assignation that is calculated using the multi-dc maglev trick - match self.initial_partition_assignation() { - Some(initial_partitions) => { - for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) { - for (id, info) in ipart.nodes.iter() { - if part.nodes.len() < self.replication_factor { - part.add(None, n_zones, id, info.unwrap()); - } - } - assert!(part.nodes.len() == self.replication_factor); - } - } - None => { - // Not enough nodes in cluster to build a correct assignation. - // Signal it by returning an error. - return false; - } - } - - // Calculate how many partitions each node should ideally store, - // and how many partitions they are storing with the current assignation - // This defines our target for which we will optimize in the following loop. 
- let total_capacity = configured_nodes - .iter() - .map(|(_, info)| info.capacity.unwrap_or(0)) - .sum::() as usize; - let total_partitions = self.replication_factor * (1 << PARTITION_BITS); - let target_partitions_per_node = configured_nodes - .iter() - .map(|(id, info)| { - ( - *id, - info.capacity.unwrap_or(0) as usize * total_partitions / total_capacity, - ) - }) - .collect::>(); - - let mut partitions_per_node = self.partitions_per_node(&partitions[..]); - - println!("Target number of partitions per node:"); - for (node, npart) in target_partitions_per_node.iter() { - println!("{:?}\t{}", node, npart); - } - println!(); - - // Shuffle partitions between nodes so that nodes will reach (or better approach) - // their target number of stored partitions - loop { - let mut option = None; - for (i, part) in partitions.iter_mut().enumerate() { - for (irm, (idrm, _)) in part.nodes.iter().enumerate() { - let errratio = |node, parts| { - let tgt = *target_partitions_per_node.get(node).unwrap() as f32; - (parts - tgt) / tgt - }; - let square = |x| x * x; - - let partsrm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as f32; - - for (idadd, infoadd) in configured_nodes.iter() { - // skip replacing a node by itself - // and skip replacing by gateway nodes - if idadd == idrm || infoadd.capacity.is_none() { - continue; - } - - // We want to try replacing node idrm by node idadd - // if that brings us close to our goal. - let partsadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as f32; - let oldcost = square(errratio(*idrm, partsrm) - errratio(*idadd, partsadd)); - let newcost = - square(errratio(*idrm, partsrm - 1.) 
- errratio(*idadd, partsadd + 1.)); - if newcost >= oldcost { - // not closer to our goal - continue; - } - let gain = oldcost - newcost; - - let mut newpart = part.clone(); - - newpart.nodes.remove(irm); - if !newpart.add(None, n_zones, idadd, infoadd) { - continue; - } - assert!(newpart.nodes.len() == self.replication_factor); - - if !old_partitions[i] - .is_valid_transition_to(&newpart, self.replication_factor) - { - continue; - } - - if option - .as_ref() - .map(|(old_gain, _, _, _, _)| gain > *old_gain) - .unwrap_or(true) - { - option = Some((gain, i, idadd, idrm, newpart)); - } - } - } - } - if let Some((_gain, i, idadd, idrm, newpart)) = option { - *partitions_per_node.entry(idadd).or_insert(0) += 1; - *partitions_per_node.get_mut(idrm).unwrap() -= 1; - partitions[i] = newpart; - } else { - break; - } - } - // Check we completed the assignation correctly - // (this is a set of checks for the algorithm's consistency) - assert!(partitions.len() == (1 << PARTITION_BITS)); - assert!(partitions - .iter() - .all(|p| p.nodes.len() == self.replication_factor)); - - let new_partitions_per_node = self.partitions_per_node(&partitions[..]); - assert!(new_partitions_per_node == partitions_per_node); - - // Show statistics - println!("New number of partitions per node:"); - for (node, npart) in partitions_per_node.iter() { - let tgt = *target_partitions_per_node.get(node).unwrap(); - let pct = 100f32 * (*npart as f32) / (tgt as f32); - println!("{:?}\t{}\t({}% of {})", node, npart, pct as i32, tgt); - } - println!(); - - let mut diffcount = HashMap::new(); - for (oldpart, newpart) in old_partitions.iter().zip(partitions.iter()) { - let nminus = oldpart.txtplus(newpart); - let nplus = newpart.txtplus(oldpart); - if nminus != "[...]" || nplus != "[...]" { - let tup = (nminus, nplus); - *diffcount.entry(tup).or_insert(0) += 1; - } - } - if diffcount.is_empty() { - println!("No data will be moved between nodes."); - } else { - let mut diffcount = 
diffcount.into_iter().collect::>(); - diffcount.sort(); - println!("Number of partitions that move:"); - for ((nminus, nplus), npart) in diffcount { - println!("\t{}\t{} -> {}", npart, nminus, nplus); - } - } - println!(); - - // Calculate and save new assignation data - let (nodes, assignation_data) = - self.compute_assignation_data(&configured_nodes[..], &partitions[..]); - - self.node_id_vec = nodes; - self.ring_assignation_data = assignation_data; - - true - } - - fn initial_partition_assignation(&self) -> Option>> { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); - - // Create a vector of partition indices (0 to 2**PARTITION_BITS-1) - let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::>(); - - // Prepare ring - let mut partitions: Vec = partitions_idx - .iter() - .map(|_i| PartitionAss::new()) - .collect::>(); - - // Create MagLev priority queues for each node - let mut queues = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(node_id, node_info)| { - let mut parts = partitions_idx - .iter() - .map(|i| { - let part_data = - [&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat(); - (*i, fasthash(&part_data[..])) - }) - .collect::>(); - parts.sort_by_key(|(_i, h)| *h); - let parts_i = parts.iter().map(|(i, _h)| *i).collect::>(); - (node_id, node_info, parts_i, 0) - }) - .collect::>(); - - let max_capacity = configured_nodes - .iter() - .filter_map(|(_, node_info)| node_info.capacity) - .fold(0, std::cmp::max); - - // Fill up ring - for rep in 0..self.replication_factor { - queues.sort_by_key(|(ni, _np, _q, _p)| { - let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat(); - fasthash(&queue_data[..]) - }); - - for (_, _, _, pos) in queues.iter_mut() { - *pos = 0; - } - - let mut remaining = partitions_idx.len(); - while remaining > 0 { - let remaining0 = remaining; - for i_round in 0..max_capacity { - for (node_id, node_info, q, pos) 
in queues.iter_mut() { - if i_round >= node_info.capacity.unwrap() { - continue; - } - for (pos2, &qv) in q.iter().enumerate().skip(*pos) { - if partitions[qv].add(Some(rep + 1), n_zones, node_id, node_info) { - remaining -= 1; - *pos = pos2 + 1; - break; - } - } - } - } - if remaining == remaining0 { - // No progress made, exit - return None; - } - } - } - - Some(partitions) - } + /// This function calculates a new partition-to-node assignation. + /// The computed assignation maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignation, it minimizes the distance to + /// the former assignation (if any) to minimize the amount of + /// data to be moved. A heuristic ensures node triplets + /// dispersion (in garage_util::bipartite::optimize_matching()). + pub fn calculate_partition_assignation(&mut self) -> bool { + + //The nodes might have been updated, some might have been deleted. + //So we need to first update the list of nodes and retrieve the + //assignation. + let old_node_assignation = self.update_nodes_and_ring(); + + let (node_zone, _) = self.get_node_zone_capacity(); + + //We compute the optimal number of partition to assign to + //every node and zone. + if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions(){ + //We collect part_per_zone in a vec to not rely on the + //arbitrary order in which elements are iterated in + //Hashmap::iter() + let part_per_zone_vec = part_per_zone.iter() + .map(|(x,y)| (x.clone(),*y)) + .collect::>(); + //We create an indexing of the zones + let mut zone_id = HashMap::::new(); + for i in 0..part_per_zone_vec.len(){ + zone_id.insert(part_per_zone_vec[i].0.clone(), i); + } + + //We compute a candidate for the new partition to zone + //assignation. 
+ let nb_zones = part_per_zone.len(); + let nb_nodes = part_per_nod.len(); + let nb_partitions = 1<> = + old_node_assignation.iter().map(|x| x.iter().map( + |id| match *id { Some(i) => zone_id[&node_zone[i]] , + None => no_zone } + ).collect()).collect(); + + //We minimize the distance to the former zone assignation + zone_assignation = optimize_matching( + &old_zone_assignation, &zone_assignation, nb_zones+1); //+1 for no_zone + + //We need to assign partitions to nodes in their zone + //We first put the nodes assignation that can stay the same + for i in 0..nb_partitions{ + for j in 0..self.replication_factor { + if let Some(Some(former_node)) = old_node_assignation[i].iter().find( + |x| if let Some(id) = x { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + } + else {false} + ) + { + if part_per_nod[*former_node] > 0 { + node_assignation[i][j] = Some(*former_node); + part_per_nod[*former_node] -= 1; + } + } + } + } + + + //We complete the assignation of partitions to nodes + let mut rng = rand::thread_rng(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if node_assignation[i][j] == None { + let possible_nodes : Vec = (0..nb_nodes) + .filter( + |id| zone_id[&node_zone[*id]] == zone_assignation[i][j] + && part_per_nod[*id] > 0).collect(); + assert!(possible_nodes.len()>0); + //We randomly pick a node + if let Some(nod) = possible_nodes.choose(&mut rng){ + node_assignation[i][j] = Some(*nod); + part_per_nod[*nod] -= 1; + } + } + } + } + + //We write the assignation in the 1D table + self.ring_assignation_data = Vec::::new(); + for i in 0..nb_partitions{ + for j in 0..self.replication_factor { + if let Some(id) = node_assignation[i][j] { + self.ring_assignation_data.push(id as CompactNodeType); + } + else {assert!(false)} + } + } + + true + } + else { false } + } + + /// The LwwMap of node roles might have changed. 
This function updates the node_id_vec + /// and returns the assignation given by ring, with the new indices of the nodes, and + /// None if the node is not present anymore. + /// We work with the assumption that only this function and calculate_partition_assignation + /// modify ring_assignation_data and node_id_vec. + fn update_nodes_and_ring(&mut self) -> Vec>> { + let nb_partitions = 1usize< = self.roles.items().iter() + .map(|(k, _, _)| *k) + .collect(); + + if ring.len() == rf*nb_partitions { + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + node_assignation[i][j] = new_node_id_vec.iter() + .position(|id| *id == self.node_id_vec[ring[i*rf + j] as usize]); + } + } + } + + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = vec![]; + return node_assignation; + } + + ///This function computes the number of partitions to assign to + ///every node and zone, so that every partition is replicated + ///self.replication_factor times and the capacity of a partition + ///is maximized. 
+ fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { + + let mut zone_capacity :HashMap= HashMap::new(); + + let (node_zone, node_capacity) = self.get_node_zone_capacity(); + let nb_nodes = self.node_id_vec.len(); + + for i in 0..nb_nodes + { + if zone_capacity.contains_key(&node_zone[i]) { + zone_capacity.insert(node_zone[i].clone(), zone_capacity[&node_zone[i]] + node_capacity[i]); + } + else{ + zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); + } + } + + //Compute the optimal number of partitions per zone + let sum_capacities: u32 =zone_capacity.values().sum(); + + if sum_capacities <= 0 { + println!("No storage capacity in the network."); + return None; + } + + let nb_partitions = 1< = + zone_capacity.iter() + .map(|(k, v)| (k.clone(), min(nb_partitions, + (self.replication_factor*nb_partitions + **v as usize)/sum_capacities as usize) ) ).collect(); + + //The replication_factor-1 upper bounds the number of + //part_per_zones that are greater than nb_partitions + for _ in 1..self.replication_factor { + //The number of partitions that are not assignated to + //a zone that takes nb_partitions. + let sum_capleft : u32 = zone_capacity.keys() + .filter(| k | {part_per_zone[*k] < nb_partitions} ) + .map(|k| zone_capacity[k]).sum(); + + //The number of replication of the data that we need + //to ensure. + let repl_left = self.replication_factor + - part_per_zone.values() + .filter(|x| {**x == nb_partitions}) + .count(); + if repl_left == 0 { + break; + } + + for k in zone_capacity.keys() { + if part_per_zone[k] != nb_partitions + { + part_per_zone.insert(k.to_string() , min(nb_partitions, + (nb_partitions*zone_capacity[k] as usize + *repl_left)/sum_capleft as usize)); + } + } + } + + //Now we divide the zone's partition share proportionally + //between their nodes. 
+ + let mut part_per_nod : Vec = (0..nb_nodes).map( + |i| (part_per_zone[&node_zone[i]]*node_capacity[i] as usize)/zone_capacity[&node_zone[i]] as usize + ) + .collect(); + + //We must update the part_per_zone to make it correspond to + //part_per_nod (because of integer rounding) + part_per_zone = part_per_zone.iter().map(|(k,_)| + (k.clone(), 0)) + .collect(); + for i in 0..nb_nodes { + part_per_zone.insert( + node_zone[i].clone() , + part_per_zone[&node_zone[i]] + part_per_nod[i]); + } + + //Because of integer rounding, the total sum of part_per_nod + //might not be replication_factor*nb_partitions. + // We need at most to add 1 to every non maximal value of + // part_per_nod. The capacity of a partition will be bounded + // by the minimal value of + // node_capacity_vec[i]/part_per_nod[i] + // so we try to maximize this minimal value, keeping the + // part_per_zone capped + + let discrepancy : usize = + nb_partitions*self.replication_factor + - part_per_nod.iter().sum::(); + + //We use a stupid O(N^2) algorithm. If the number of nodes + //is actually expected to be high, one should optimize this. + + for _ in 0..discrepancy { + if let Some(idmax) = (0..nb_nodes) + .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) + .max_by( |i,j| + (node_capacity[*i]*(part_per_nod[*j]+1) as u32) + .cmp(&(node_capacity[*j]*(part_per_nod[*i]+1) as u32)) + ) + { + part_per_nod[idmax] += 1; + part_per_zone.insert(node_zone[idmax].clone(),part_per_zone[&node_zone[idmax]]+1); + } + } + + //We check the algorithm consistency + + let discrepancy : usize = + nb_partitions*self.replication_factor + - part_per_nod.iter().sum::(); + assert!(discrepancy == 0); + assert!(if let Some(v) = part_per_zone.values().max() + {*v <= nb_partitions} else {false} ); + + Some((part_per_nod, part_per_zone)) + } + + + //Returns vectors of zone and capacity; indexed by the same (temporary) + //indices as node_id_vec. 
+ fn get_node_zone_capacity(& self) -> (Vec , Vec) { + + let node_zone = self.node_id_vec.iter().map( + |id_nod| match self.node_role(id_nod) { + Some(NodeRole{zone,capacity:_,tags:_}) => zone.clone() , + _ => "".to_string() + } + ).collect(); + + let node_capacity = self.node_id_vec.iter().map( + |id_nod| match self.node_role(id_nod) { + Some(NodeRole{zone:_,capacity,tags:_}) => + if let Some(c)=capacity + {*c} + else {0}, + _ => 0 + } + ).collect(); + + (node_zone,node_capacity) + } - fn configured_nodes_and_zones(&self) -> (Vec<(&Uuid, &NodeRole)>, HashSet<&str>) { - let configured_nodes = self - .roles - .items() - .iter() - .filter(|(_id, _, info)| info.0.is_some()) - .map(|(id, _, info)| (id, info.0.as_ref().unwrap())) - .collect::>(); - - let zones = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(_id, info)| info.zone.as_str()) - .collect::>(); - - (configured_nodes, zones) - } - - fn compute_assignation_data<'a>( - &self, - configured_nodes: &[(&'a Uuid, &'a NodeRole)], - partitions: &[PartitionAss<'a>], - ) -> (Vec, Vec) { - assert!(partitions.len() == (1 << PARTITION_BITS)); - - // Make a canonical order for nodes - let mut nodes = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(id, _)| **id) - .collect::>(); - let nodes_rev = nodes - .iter() - .enumerate() - .map(|(i, id)| (*id, i as CompactNodeType)) - .collect::>(); - - let mut assignation_data = vec![]; - for partition in partitions.iter() { - assert!(partition.nodes.len() == self.replication_factor); - for (id, _) in partition.nodes.iter() { - assignation_data.push(*nodes_rev.get(id).unwrap()); - } - } - - nodes.extend( - configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_none()) - .map(|(id, _)| **id), - ); - - (nodes, assignation_data) - } - - fn parse_assignation_data(&self) -> Vec> { - if self.ring_assignation_data.len() == self.replication_factor * (1 << PARTITION_BITS) { - // If the previous assignation 
data is correct, use that - let mut partitions = vec![]; - for i in 0..(1 << PARTITION_BITS) { - let mut part = PartitionAss::new(); - for node_i in self.ring_assignation_data - [i * self.replication_factor..(i + 1) * self.replication_factor] - .iter() - { - let node_id = &self.node_id_vec[*node_i as usize]; - - if let Some(NodeRoleV(Some(info))) = self.roles.get(node_id) { - part.nodes.push((node_id, Some(info))); - } else { - part.nodes.push((node_id, None)); - } - } - partitions.push(part); - } - partitions - } else { - // Otherwise start fresh - (0..(1 << PARTITION_BITS)) - .map(|_| PartitionAss::new()) - .collect() - } - } - - fn partitions_per_node<'a>(&self, partitions: &[PartitionAss<'a>]) -> HashMap<&'a Uuid, usize> { - let mut partitions_per_node = HashMap::<&Uuid, usize>::new(); - for p in partitions.iter() { - for (id, _) in p.nodes.iter() { - *partitions_per_node.entry(*id).or_insert(0) += 1; - } - } - partitions_per_node - } -} - -// ---- Internal structs for partition assignation in layout ---- - -#[derive(Clone)] -struct PartitionAss<'a> { - nodes: Vec<(&'a Uuid, Option<&'a NodeRole>)>, } -impl<'a> PartitionAss<'a> { - fn new() -> Self { - Self { nodes: Vec::new() } - } - fn nplus(&self, other: &PartitionAss<'a>) -> usize { - self.nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .count() - } - fn txtplus(&self, other: &PartitionAss<'a>) -> String { - let mut nodes = self - .nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .map(|x| format!("{:?}", x.0)) - .collect::>(); - nodes.sort(); - if self.nodes.iter().any(|x| other.nodes.contains(x)) { - nodes.push("...".into()); - } - format!("[{}]", nodes.join(" ")) - } +#[cfg(test)] +mod tests { + use super::*; + use itertools::Itertools; + + fn check_assignation(cl : &ClusterLayout) { + + //Check that input data has the right format + let nb_partitions = 1usize<>(); + + let zone_vec = node_zone.iter().unique().collect::>(); + let zone_nb_part = zone_vec.iter().map( |z| 
cl.ring_assignation_data.iter() + .filter(|x| node_zone[**x as usize] == **z) + .count() + ).collect::>(); + + //Check optimality of the zone assignation : would it be better for the + //node_capacity/node_partitions ratio to change the assignation of a partition + + if let Some(idmin) = (0..nb_nodes).min_by( + |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) + ){ + if let Some(idnew) = (0..nb_nodes) + .filter( |i| if let Some(p) = zone_vec.iter().position(|z| **z==node_zone[*i]) + {zone_nb_part[p] < nb_partitions } + else { false }) + .max_by( + |i,j| (node_capacity[*i]*(node_nb_part[*j]as u32+1)) + .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) + ){ + assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= + node_capacity[idnew]*node_nb_part[idmin] as u32); + } + + } + + //In every zone, check optimality of the nod assignation + for z in zone_vec { + let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z ); + if let Some(idmin) = node_of_z_iter.clone().min_by( + |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) + ){ + if let Some(idnew) = node_of_z_iter.min_by( + |i,j| (node_capacity[*i]*(node_nb_part[*j] as u32+1)) + .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) + ){ + assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= + node_capacity[idnew]*node_nb_part[idmin] as u32); + } + } + } + + } + + fn update_layout(cl : &mut ClusterLayout, node_id_vec : &Vec, + node_capacity_vec : &Vec , node_zone_vec : &Vec) { + for i in 0..node_id_vec.len(){ + if let Some(x) = FixedBytes32::try_from(&[i as u8;32]) { + cl.node_id_vec.push(x); + } + + let update = cl.roles.update_mutator(cl.node_id_vec[i] , + NodeRoleV(Some(NodeRole{ + zone : (node_zone_vec[i].to_string()), + capacity : (Some(node_capacity_vec[i])), + tags : (vec![])}))); + cl.roles.merge(&update); + } + } + + #[test] + fn test_assignation() { + + let mut 
node_id_vec = vec![1,2,3]; + let mut node_capacity_vec = vec![4000,1000,2000]; + let mut node_zone_vec= vec!["A", "B", "C"].into_iter().map(|x| x.to_string()).collect(); + + let mut cl = ClusterLayout { + node_id_vec: vec![], + + roles : LwwMap::new(), + + replication_factor: 3, + ring_assignation_data : vec![], + version:0, + staging: LwwMap::new(), + staging_hash: sha256sum(&[1;32]), + }; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_id_vec = vec![1,2,3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000,1000,1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec= vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"].into_iter().map(|x| x.to_string()).collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000,1000,2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + + node_capacity_vec = vec![4000,4000,2000, 7000, 1000, 9000, 2000, 10, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + } +} - fn is_valid_transition_to(&self, other: &PartitionAss<'a>, replication_factor: usize) -> bool { - let min_keep_nodes_per_part = (replication_factor + 1) / 2; - let n_removed = self.nplus(other); - if self.nodes.len() <= min_keep_nodes_per_part { - n_removed == 0 - } else { - n_removed <= self.nodes.len() - min_keep_nodes_per_part - } - } - // add is a key function in creating a PartitionAss, i.e. the list of nodes - // to which a partition is assigned. 
It tries to add a certain node id to the - // assignation, but checks that doing so is compatible with the NECESSARY - // condition that the partition assignation must be dispersed over different - // zones (datacenters) if enough zones exist. This is why it takes a n_zones - // parameter, which is the total number of zones that have existing nodes: - // if nodes in the assignation already cover all n_zones zones, then any node - // that is not yet in the assignation can be added. Otherwise, only nodes - // that are in a new zone can be added. - fn add( - &mut self, - target_len: Option, - n_zones: usize, - node: &'a Uuid, - role: &'a NodeRole, - ) -> bool { - if let Some(tl) = target_len { - if self.nodes.len() != tl - 1 { - return false; - } - } - - let p_zns = self - .nodes - .iter() - .map(|(_id, info)| info.unwrap().zone.as_str()) - .collect::>(); - if (p_zns.len() < n_zones && !p_zns.contains(&role.zone.as_str())) - || (p_zns.len() == n_zones && !self.nodes.iter().any(|(id, _)| *id == node)) - { - self.nodes.push((node, Some(role))); - true - } else { - false - } - } -} diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs new file mode 100644 index 00000000..aec7b042 --- /dev/null +++ b/src/util/bipartite.rs @@ -0,0 +1,378 @@ +/* + * This module deals with graph algorithm in complete bipartite + * graphs. It is used in layout.rs to build the partition to node + * assignation. + * */ + +use std::cmp::{min,max}; +use std::collections::VecDeque; +use rand::prelude::SliceRandom; + +//Graph data structure for the flow algorithm. +#[derive(Clone,Copy,Debug)] +struct EdgeFlow{ + c : i32, + flow : i32, + v : usize, + rev : usize, +} + +//Graph data structure for the detection of positive cycles. +#[derive(Clone,Copy,Debug)] +struct WeightedEdge{ + w : i32, + u : usize, + v : usize, +} + + +/* This function takes two matchings (old_match and new_match) in a + * complete bipartite graph. 
It returns a matching that has the + * same degree as new_match at every vertex, and that is as close + * as possible to old_match. + * */ +pub fn optimize_matching( old_match : &Vec> , + new_match : &Vec> , + nb_right : usize ) + -> Vec> { + let nb_left = old_match.len(); + let ed = WeightedEdge{w:-1,u:0,v:0}; + let mut edge_vec = vec![ed ; nb_left*nb_right]; + + //We build the complete bipartite graph structure, represented + //by the list of all edges. + for i in 0..nb_left { + for j in 0..nb_right{ + edge_vec[i*nb_right + j].u = i; + edge_vec[i*nb_right + j].v = nb_left+j; + } + } + + for i in 0..edge_vec.len() { + //We add the old matchings + if old_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) { + edge_vec[i].w *= -1; + } + //We add the new matchings + if new_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) { + (edge_vec[i].u,edge_vec[i].v) = + (edge_vec[i].v,edge_vec[i].u); + edge_vec[i].w *= -1; + } + } + //Now edge_vec is a graph where edges are oriented LR if we + //can add them to new_match, and RL otherwise. If + //adding/removing them makes the matching closer to old_match + //they have weight 1; and -1 otherwise. + + //We shuffle the edge list so that there is no bias depending on + //partitions/zone label in the triplet dispersion + let mut rng = rand::thread_rng(); + edge_vec.shuffle(&mut rng); + + //Discovering and flipping a cycle with positive weight in this + //graph will make the matching closer to old_match. + //We use the Bellman-Ford algorithm to discover positive cycles + loop{ + if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { + for i in cycle { + //We flip the edges of the cycle. + (edge_vec[i].u,edge_vec[i].v) = + (edge_vec[i].v,edge_vec[i].u); + edge_vec[i].w *= -1; + } + } + else { + //If there is no cycle, we return the optimal matching. + break; + } + } + + //The optimal matching is built from the graph structure. 
+ let mut matching = vec![Vec::::new() ; nb_left]; + for e in edge_vec { + if e.u > e.v { + matching[e.v].push(e.u-nb_left); + } + } + matching +} + +//This function finds a positive cycle in a bipartite weighted graph. +fn positive_cycle( edge_vec : &Vec, nb_left : usize, + nb_right : usize) -> Option> { + let nb_side_min = min(nb_left, nb_right); + let nb_vertices = nb_left+nb_right; + let weight_lowerbound = -((nb_left +nb_right) as i32) -1; + let mut accessed = vec![false ; nb_left]; + + //We try to find a positive cycle accessible from the left + //vertex i. + for i in 0..nb_left{ + if accessed[i] { + continue; + } + let mut weight =vec![weight_lowerbound ; nb_vertices]; + let mut prev =vec![ edge_vec.len() ; nb_vertices]; + weight[i] = 0; + //We compute largest weighted paths from i. + //Since the graph is bipartite, any simple cycle has length + //at most 2*nb_side_min. In the general Bellman-Ford + //algorithm, the bound here is the number of vertices. Since + //the number of partitions can be much larger than the + //number of nodes, we optimize that. + for _ in 0..(2*nb_side_min) { + for j in 0..edge_vec.len() { + let e = edge_vec[j]; + if weight[e.v] < weight[e.u]+e.w { + weight[e.v] = weight[e.u]+e.w; + prev[e.v] = j; + } + } + } + //We update the accessed table + for i in 0..nb_left { + if weight[i] > weight_lowerbound { + accessed[i] = true; + } + } + //We detect a positive cycle + for e in edge_vec { + if weight[e.v] < weight[e.u]+e.w { + //it means e is on a path branching from a positive cycle + let mut was_seen = vec![false ; nb_vertices]; + let mut curr = e.u; + //We track back with prev until we reach the cycle. + while !was_seen[curr]{ + was_seen[curr] = true; + curr = edge_vec[prev[curr]].u; + } + //Now curr is on the cycle. We collect the edge ids. 
+ let mut cycle = Vec::::new(); + cycle.push(prev[curr]); + let mut cycle_vert = edge_vec[prev[curr]].u; + while cycle_vert != curr { + cycle.push(prev[cycle_vert]); + cycle_vert = edge_vec[prev[cycle_vert]].u; + } + + return Some(cycle); + } + } + } + + None +} + + +// This function takes two arrays of capacity and computes the +// maximal matching in the complete bipartite graph such that the +// left vertex i is matched to left_cap_vec[i] right vertices, and +// the right vertex j is matched to right_cap_vec[j] left vertices. +// To do so, we use Dinic's maximum flow algorithm. +pub fn dinic_compute_matching( left_cap_vec : Vec, + right_cap_vec : Vec) -> Vec< Vec > +{ + let mut graph = Vec:: >::new(); + let ed = EdgeFlow{c:0,flow:0,v:0, rev:0}; + + // 0 will be the source + graph.push(vec![ed ; left_cap_vec.len()]); + for i in 0..left_cap_vec.len() + { + graph[0][i].c = left_cap_vec[i] as i32; + graph[0][i].v = i+2; + graph[0][i].rev = 0; + } + + //1 will be the sink + graph.push(vec![ed ; right_cap_vec.len()]); + for i in 0..right_cap_vec.len() + { + graph[1][i].c = right_cap_vec[i] as i32; + graph[1][i].v = i+2+left_cap_vec.len(); + graph[1][i].rev = 0; + } + + //we add left vertices + for i in 0..left_cap_vec.len() { + graph.push(vec![ed ; 1+right_cap_vec.len()]); + graph[i+2][0].c = 0; //directed + graph[i+2][0].v = 0; + graph[i+2][0].rev = i; + + for j in 0..right_cap_vec.len() { + graph[i+2][j+1].c = 1; + graph[i+2][j+1].v = 2+left_cap_vec.len()+j; + graph[i+2][j+1].rev = i+1; + } + } + + //we add right vertices + for i in 0..right_cap_vec.len() { + let lft_ln = left_cap_vec.len(); + graph.push(vec![ed ; 1+lft_ln]); + graph[i+lft_ln+2][0].c = graph[1][i].c; + graph[i+lft_ln+2][0].v = 1; + graph[i+lft_ln+2][0].rev = i; + + for j in 0..left_cap_vec.len() { + graph[i+2+lft_ln][j+1].c = 0; //directed + graph[i+2+lft_ln][j+1].v = j+2; + graph[i+2+lft_ln][j+1].rev = i+1; + } + } + + //To ensure the dispersion of the triplets generated by the + //assignation, we 
shuffle the neighbours of the nodes. Hence, + //left vertices do not consider the right ones in the same order. + let mut rng = rand::thread_rng(); + for i in 0..graph.len() { + graph[i].shuffle(&mut rng); + //We need to update the ids of the reverse edges. + for j in 0..graph[i].len() { + let target_v = graph[i][j].v; + let target_rev = graph[i][j].rev; + graph[target_v][target_rev].rev = j; + } + } + + let nb_vertices = graph.len(); + + //We run Dinic's max flow algorithm + loop{ + //We build the level array from Dinic's algorithm. + let mut level = vec![-1; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((0,0)); + while !fifo.is_empty() { + if let Some((id,lvl)) = fifo.pop_front(){ + if level[id] == -1 { + level[id] = lvl; + for e in graph[id].iter(){ + if e.c-e.flow > 0{ + fifo.push_back((e.v,lvl+1)); + } + } + } + } + } + if level[1] == -1 { + //There is no residual flow + break; + } + + //Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = VecDeque::new(); + + let flow_upper_bound; + if let Some(x) = left_cap_vec.iter().max() { + flow_upper_bound=*x as i32; + } + else { + flow_upper_bound = 0; + assert!(false); + } + + lifo.push_back((0,flow_upper_bound)); + + loop + { + if let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == 1 { + //The DFS reached the sink, we can add a + //residual flow. 
+ lifo.pop_back(); + while !lifo.is_empty() { + if let Some((id,_)) = lifo.pop_back(){ + let nbd=next_nbd[id]; + graph[id][nbd].flow += f; + let id_v = graph[id][nbd].v; + let nbd_v = graph[id][nbd].rev; + graph[id_v][nbd_v].flow -= f; + } + } + lifo.push_back((0,flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back(){ + next_nbd[*parent] +=1; + } + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min(f,graph[id][nbd].c + - graph[id][nbd].flow); + if level[graph[id][nbd].v] <= level[id] || + new_flow == 0 { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + //otherwise, we send flow to nbd. + lifo.push_back((graph[id][nbd].v, new_flow)); + } + else { + break; + } + } + } + + //We return the association + let assoc_table = (0..left_cap_vec.len()).map( + |id| graph[id+2].iter() + .filter(|e| e.flow > 0) + .map( |e| e.v-2-left_cap_vec.len()) + .collect()).collect(); + + //consistency check + + //it is a flow + for i in 3..graph.len(){ + assert!( graph[i].iter().map(|e| e.flow).sum::() == 0); + for e in graph[i].iter(){ + assert!(e.flow + graph[e.v][e.rev].flow == 0); + } + } + + //it solves the matching problem + for i in 0..left_cap_vec.len(){ + assert!(left_cap_vec[i] as i32 == + graph[i+2].iter().map(|e| max(0,e.flow)).sum::()); + } + for i in 0..right_cap_vec.len(){ + assert!(right_cap_vec[i] as i32 == + graph[i+2+left_cap_vec.len()].iter() + .map(|e| max(0,e.flow)).sum::()); + } + + + assoc_table +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_flow() { + let left_vec = vec![3;8]; + let right_vec = vec![0,4,8,4,8]; + //There are asserts in the function that computes the flow + let _ = dinic_compute_matching(left_vec, right_vec); + } + + //maybe add tests relative to the matching optilization ? 
+} + + diff --git a/src/util/lib.rs b/src/util/lib.rs index e83fc2e6..891549c3 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -4,6 +4,7 @@ extern crate tracing; pub mod background; +pub mod bipartite; pub mod config; pub mod crdt; pub mod data; -- cgit v1.2.3 From 2aeaddd5e2e1911b084f6d49ccb2236b7fec31af Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 1 May 2022 09:57:05 +0200 Subject: Apply cargo fmt --- src/rpc/layout.rs | 940 ++++++++++++++++++++++++++------------------------ src/util/bipartite.rs | 694 +++++++++++++++++++------------------ 2 files changed, 842 insertions(+), 792 deletions(-) diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index afd7df17..ac31da72 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,12 +1,12 @@ +use std::cmp::min; use std::cmp::Ordering; -use std::cmp::{min}; -use std::collections::{HashMap}; +use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use garage_util::bipartite::*; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; -use garage_util::bipartite::*; use rand::prelude::SliceRandom; @@ -168,454 +168,506 @@ impl ClusterLayout { true } + /// This function calculates a new partition-to-node assignation. + /// The computed assignation maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignation, it minimizes the distance to + /// the former assignation (if any) to minimize the amount of + /// data to be moved. A heuristic ensures node triplets + /// dispersion (in garage_util::bipartite::optimize_matching()). + pub fn calculate_partition_assignation(&mut self) -> bool { + //The nodes might have been updated, some might have been deleted. + //So we need to first update the list of nodes and retrieve the + //assignation. 
+ let old_node_assignation = self.update_nodes_and_ring(); + + let (node_zone, _) = self.get_node_zone_capacity(); + + //We compute the optimal number of partition to assign to + //every node and zone. + if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions() { + //We collect part_per_zone in a vec to not rely on the + //arbitrary order in which elements are iterated in + //Hashmap::iter() + let part_per_zone_vec = part_per_zone + .iter() + .map(|(x, y)| (x.clone(), *y)) + .collect::>(); + //We create an indexing of the zones + let mut zone_id = HashMap::::new(); + for i in 0..part_per_zone_vec.len() { + zone_id.insert(part_per_zone_vec[i].0.clone(), i); + } - /// This function calculates a new partition-to-node assignation. - /// The computed assignation maximizes the capacity of a - /// partition (assuming all partitions have the same size). - /// Among such optimal assignation, it minimizes the distance to - /// the former assignation (if any) to minimize the amount of - /// data to be moved. A heuristic ensures node triplets - /// dispersion (in garage_util::bipartite::optimize_matching()). - pub fn calculate_partition_assignation(&mut self) -> bool { - - //The nodes might have been updated, some might have been deleted. - //So we need to first update the list of nodes and retrieve the - //assignation. - let old_node_assignation = self.update_nodes_and_ring(); - - let (node_zone, _) = self.get_node_zone_capacity(); - - //We compute the optimal number of partition to assign to - //every node and zone. 
- if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions(){ - //We collect part_per_zone in a vec to not rely on the - //arbitrary order in which elements are iterated in - //Hashmap::iter() - let part_per_zone_vec = part_per_zone.iter() - .map(|(x,y)| (x.clone(),*y)) - .collect::>(); - //We create an indexing of the zones - let mut zone_id = HashMap::::new(); - for i in 0..part_per_zone_vec.len(){ - zone_id.insert(part_per_zone_vec[i].0.clone(), i); - } - - //We compute a candidate for the new partition to zone - //assignation. - let nb_zones = part_per_zone.len(); - let nb_nodes = part_per_nod.len(); - let nb_partitions = 1<> = - old_node_assignation.iter().map(|x| x.iter().map( - |id| match *id { Some(i) => zone_id[&node_zone[i]] , - None => no_zone } - ).collect()).collect(); - - //We minimize the distance to the former zone assignation - zone_assignation = optimize_matching( - &old_zone_assignation, &zone_assignation, nb_zones+1); //+1 for no_zone - - //We need to assign partitions to nodes in their zone - //We first put the nodes assignation that can stay the same - for i in 0..nb_partitions{ - for j in 0..self.replication_factor { - if let Some(Some(former_node)) = old_node_assignation[i].iter().find( - |x| if let Some(id) = x { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - } - else {false} - ) - { - if part_per_nod[*former_node] > 0 { - node_assignation[i][j] = Some(*former_node); - part_per_nod[*former_node] -= 1; - } - } - } - } - - - //We complete the assignation of partitions to nodes - let mut rng = rand::thread_rng(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if node_assignation[i][j] == None { - let possible_nodes : Vec = (0..nb_nodes) - .filter( - |id| zone_id[&node_zone[*id]] == zone_assignation[i][j] - && part_per_nod[*id] > 0).collect(); - assert!(possible_nodes.len()>0); - //We randomly pick a node - if let Some(nod) = possible_nodes.choose(&mut rng){ - node_assignation[i][j] = Some(*nod); - 
part_per_nod[*nod] -= 1; - } - } - } - } - - //We write the assignation in the 1D table - self.ring_assignation_data = Vec::::new(); - for i in 0..nb_partitions{ - for j in 0..self.replication_factor { - if let Some(id) = node_assignation[i][j] { - self.ring_assignation_data.push(id as CompactNodeType); - } - else {assert!(false)} - } - } - - true - } - else { false } - } - - /// The LwwMap of node roles might have changed. This function updates the node_id_vec - /// and returns the assignation given by ring, with the new indices of the nodes, and - /// None of the node is not present anymore. - /// We work with the assumption that only this function and calculate_new_assignation - /// do modify assignation_ring and node_id_vec. - fn update_nodes_and_ring(&mut self) -> Vec>> { - let nb_partitions = 1usize< = self.roles.items().iter() - .map(|(k, _, _)| *k) - .collect(); - - if ring.len() == rf*nb_partitions { - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - node_assignation[i][j] = new_node_id_vec.iter() - .position(|id| *id == self.node_id_vec[ring[i*rf + j] as usize]); - } - } - } - - self.node_id_vec = new_node_id_vec; - self.ring_assignation_data = vec![]; - return node_assignation; - } - - ///This function compute the number of partition to assign to - ///every node and zone, so that every partition is replicated - ///self.replication_factor times and the capacity of a partition - ///is maximized. 
- fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { - - let mut zone_capacity :HashMap= HashMap::new(); - - let (node_zone, node_capacity) = self.get_node_zone_capacity(); - let nb_nodes = self.node_id_vec.len(); - - for i in 0..nb_nodes - { - if zone_capacity.contains_key(&node_zone[i]) { - zone_capacity.insert(node_zone[i].clone(), zone_capacity[&node_zone[i]] + node_capacity[i]); - } - else{ - zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); - } - } - - //Compute the optimal number of partitions per zone - let sum_capacities: u32 =zone_capacity.values().sum(); - - if sum_capacities <= 0 { - println!("No storage capacity in the network."); - return None; - } - - let nb_partitions = 1< = - zone_capacity.iter() - .map(|(k, v)| (k.clone(), min(nb_partitions, - (self.replication_factor*nb_partitions - **v as usize)/sum_capacities as usize) ) ).collect(); - - //The replication_factor-1 upper bounds the number of - //part_per_zones that are greater than nb_partitions - for _ in 1..self.replication_factor { - //The number of partitions that are not assignated to - //a zone that takes nb_partitions. - let sum_capleft : u32 = zone_capacity.keys() - .filter(| k | {part_per_zone[*k] < nb_partitions} ) - .map(|k| zone_capacity[k]).sum(); - - //The number of replication of the data that we need - //to ensure. - let repl_left = self.replication_factor - - part_per_zone.values() - .filter(|x| {**x == nb_partitions}) - .count(); - if repl_left == 0 { - break; - } - - for k in zone_capacity.keys() { - if part_per_zone[k] != nb_partitions - { - part_per_zone.insert(k.to_string() , min(nb_partitions, - (nb_partitions*zone_capacity[k] as usize - *repl_left)/sum_capleft as usize)); - } - } - } - - //Now we divide the zone's partition share proportionally - //between their nodes. 
- - let mut part_per_nod : Vec = (0..nb_nodes).map( - |i| (part_per_zone[&node_zone[i]]*node_capacity[i] as usize)/zone_capacity[&node_zone[i]] as usize - ) - .collect(); - - //We must update the part_per_zone to make it correspond to - //part_per_nod (because of integer rounding) - part_per_zone = part_per_zone.iter().map(|(k,_)| - (k.clone(), 0)) - .collect(); - for i in 0..nb_nodes { - part_per_zone.insert( - node_zone[i].clone() , - part_per_zone[&node_zone[i]] + part_per_nod[i]); - } - - //Because of integer rounding, the total sum of part_per_nod - //might not be replication_factor*nb_partitions. - // We need at most to add 1 to every non maximal value of - // part_per_nod. The capacity of a partition will be bounded - // by the minimal value of - // node_capacity_vec[i]/part_per_nod[i] - // so we try to maximize this minimal value, keeping the - // part_per_zone capped - - let discrepancy : usize = - nb_partitions*self.replication_factor - - part_per_nod.iter().sum::(); - - //We use a stupid O(N^2) algorithm. If the number of nodes - //is actually expected to be high, one should optimize this. - - for _ in 0..discrepancy { - if let Some(idmax) = (0..nb_nodes) - .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) - .max_by( |i,j| - (node_capacity[*i]*(part_per_nod[*j]+1) as u32) - .cmp(&(node_capacity[*j]*(part_per_nod[*i]+1) as u32)) - ) - { - part_per_nod[idmax] += 1; - part_per_zone.insert(node_zone[idmax].clone(),part_per_zone[&node_zone[idmax]]+1); - } - } - - //We check the algorithm consistency - - let discrepancy : usize = - nb_partitions*self.replication_factor - - part_per_nod.iter().sum::(); - assert!(discrepancy == 0); - assert!(if let Some(v) = part_per_zone.values().max() - {*v <= nb_partitions} else {false} ); - - Some((part_per_nod, part_per_zone)) - } - - - //Returns vectors of zone and capacity; indexed by the same (temporary) - //indices as node_id_vec. 
- fn get_node_zone_capacity(& self) -> (Vec , Vec) { - - let node_zone = self.node_id_vec.iter().map( - |id_nod| match self.node_role(id_nod) { - Some(NodeRole{zone,capacity:_,tags:_}) => zone.clone() , - _ => "".to_string() - } - ).collect(); - - let node_capacity = self.node_id_vec.iter().map( - |id_nod| match self.node_role(id_nod) { - Some(NodeRole{zone:_,capacity,tags:_}) => - if let Some(c)=capacity - {*c} - else {0}, - _ => 0 - } - ).collect(); - - (node_zone,node_capacity) - } + //We compute a candidate for the new partition to zone + //assignation. + let nb_zones = part_per_zone.len(); + let nb_nodes = part_per_nod.len(); + let nb_partitions = 1 << PARTITION_BITS; + let left_cap_vec = vec![self.replication_factor as u32; nb_partitions]; + let right_cap_vec = part_per_zone_vec.iter().map(|(_, y)| *y as u32).collect(); + let mut zone_assignation = dinic_compute_matching(left_cap_vec, right_cap_vec); + + //We create the structure for the partition-to-node assignation. + let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; + //We will decrement part_per_nod to keep track of the number + //of partitions that we still have to associate. 
+ let mut part_per_nod = part_per_nod.clone(); + + //We minimize the distance to the former assignation(if any) + + //We get the id of the zones of the former assignation + //(and the id no_zone if there is no node assignated) + let no_zone = part_per_zone_vec.len(); + let old_zone_assignation: Vec> = old_node_assignation + .iter() + .map(|x| { + x.iter() + .map(|id| match *id { + Some(i) => zone_id[&node_zone[i]], + None => no_zone, + }) + .collect() + }) + .collect(); + + //We minimize the distance to the former zone assignation + zone_assignation = + optimize_matching(&old_zone_assignation, &zone_assignation, nb_zones + 1); //+1 for no_zone + + //We need to assign partitions to nodes in their zone + //We first put the nodes assignation that can stay the same + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if let Some(Some(former_node)) = old_node_assignation[i].iter().find(|x| { + if let Some(id) = x { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + } else { + false + } + }) { + if part_per_nod[*former_node] > 0 { + node_assignation[i][j] = Some(*former_node); + part_per_nod[*former_node] -= 1; + } + } + } + } -} + //We complete the assignation of partitions to nodes + let mut rng = rand::thread_rng(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if node_assignation[i][j] == None { + let possible_nodes: Vec = (0..nb_nodes) + .filter(|id| { + zone_id[&node_zone[*id]] == zone_assignation[i][j] + && part_per_nod[*id] > 0 + }) + .collect(); + assert!(possible_nodes.len() > 0); + //We randomly pick a node + if let Some(nod) = possible_nodes.choose(&mut rng) { + node_assignation[i][j] = Some(*nod); + part_per_nod[*nod] -= 1; + } + } + } + } + + //We write the assignation in the 1D table + self.ring_assignation_data = Vec::::new(); + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + if let Some(id) = node_assignation[i][j] { + self.ring_assignation_data.push(id as CompactNodeType); + } else { 
+ assert!(false) + } + } + } + true + } else { + false + } + } + + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignation given by ring, with the new indices of the nodes, and + /// None of the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignation + /// do modify assignation_ring and node_id_vec. + fn update_nodes_and_ring(&mut self) -> Vec>> { + let nb_partitions = 1usize << PARTITION_BITS; + let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; + let rf = self.replication_factor; + let ring = &self.ring_assignation_data; + + let new_node_id_vec: Vec = self.roles.items().iter().map(|(k, _, _)| *k).collect(); + + if ring.len() == rf * nb_partitions { + for i in 0..nb_partitions { + for j in 0..self.replication_factor { + node_assignation[i][j] = new_node_id_vec + .iter() + .position(|id| *id == self.node_id_vec[ring[i * rf + j] as usize]); + } + } + } + + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = vec![]; + return node_assignation; + } + + ///This function compute the number of partition to assign to + ///every node and zone, so that every partition is replicated + ///self.replication_factor times and the capacity of a partition + ///is maximized. 
+ fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { + let mut zone_capacity: HashMap = HashMap::new(); + + let (node_zone, node_capacity) = self.get_node_zone_capacity(); + let nb_nodes = self.node_id_vec.len(); + + for i in 0..nb_nodes { + if zone_capacity.contains_key(&node_zone[i]) { + zone_capacity.insert( + node_zone[i].clone(), + zone_capacity[&node_zone[i]] + node_capacity[i], + ); + } else { + zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); + } + } + + //Compute the optimal number of partitions per zone + let sum_capacities: u32 = zone_capacity.values().sum(); + + if sum_capacities <= 0 { + println!("No storage capacity in the network."); + return None; + } + + let nb_partitions = 1 << PARTITION_BITS; + + //Initially we would like to use zones porportionally to + //their capacity. + //However, a large zone can be associated to at most + //nb_partitions to ensure replication of the date. + //So we take the min with nb_partitions: + let mut part_per_zone: HashMap = zone_capacity + .iter() + .map(|(k, v)| { + ( + k.clone(), + min( + nb_partitions, + (self.replication_factor * nb_partitions * *v as usize) + / sum_capacities as usize, + ), + ) + }) + .collect(); + + //The replication_factor-1 upper bounds the number of + //part_per_zones that are greater than nb_partitions + for _ in 1..self.replication_factor { + //The number of partitions that are not assignated to + //a zone that takes nb_partitions. + let sum_capleft: u32 = zone_capacity + .keys() + .filter(|k| part_per_zone[*k] < nb_partitions) + .map(|k| zone_capacity[k]) + .sum(); + + //The number of replication of the data that we need + //to ensure. 
+ let repl_left = self.replication_factor + - part_per_zone + .values() + .filter(|x| **x == nb_partitions) + .count(); + if repl_left == 0 { + break; + } + + for k in zone_capacity.keys() { + if part_per_zone[k] != nb_partitions { + part_per_zone.insert( + k.to_string(), + min( + nb_partitions, + (nb_partitions * zone_capacity[k] as usize * repl_left) + / sum_capleft as usize, + ), + ); + } + } + } + + //Now we divide the zone's partition share proportionally + //between their nodes. + + let mut part_per_nod: Vec = (0..nb_nodes) + .map(|i| { + (part_per_zone[&node_zone[i]] * node_capacity[i] as usize) + / zone_capacity[&node_zone[i]] as usize + }) + .collect(); + + //We must update the part_per_zone to make it correspond to + //part_per_nod (because of integer rounding) + part_per_zone = part_per_zone.iter().map(|(k, _)| (k.clone(), 0)).collect(); + for i in 0..nb_nodes { + part_per_zone.insert( + node_zone[i].clone(), + part_per_zone[&node_zone[i]] + part_per_nod[i], + ); + } + + //Because of integer rounding, the total sum of part_per_nod + //might not be replication_factor*nb_partitions. + // We need at most to add 1 to every non maximal value of + // part_per_nod. The capacity of a partition will be bounded + // by the minimal value of + // node_capacity_vec[i]/part_per_nod[i] + // so we try to maximize this minimal value, keeping the + // part_per_zone capped + + let discrepancy: usize = + nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); + + //We use a stupid O(N^2) algorithm. If the number of nodes + //is actually expected to be high, one should optimize this. 
+ + for _ in 0..discrepancy { + if let Some(idmax) = (0..nb_nodes) + .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) + .max_by(|i, j| { + (node_capacity[*i] * (part_per_nod[*j] + 1) as u32) + .cmp(&(node_capacity[*j] * (part_per_nod[*i] + 1) as u32)) + }) { + part_per_nod[idmax] += 1; + part_per_zone.insert( + node_zone[idmax].clone(), + part_per_zone[&node_zone[idmax]] + 1, + ); + } + } + //We check the algorithm consistency + + let discrepancy: usize = + nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); + assert!(discrepancy == 0); + assert!(if let Some(v) = part_per_zone.values().max() { + *v <= nb_partitions + } else { + false + }); + + Some((part_per_nod, part_per_zone)) + } + + //Returns vectors of zone and capacity; indexed by the same (temporary) + //indices as node_id_vec. + fn get_node_zone_capacity(&self) -> (Vec, Vec) { + let node_zone = self + .node_id_vec + .iter() + .map(|id_nod| match self.node_role(id_nod) { + Some(NodeRole { + zone, + capacity: _, + tags: _, + }) => zone.clone(), + _ => "".to_string(), + }) + .collect(); + + let node_capacity = self + .node_id_vec + .iter() + .map(|id_nod| match self.node_role(id_nod) { + Some(NodeRole { + zone: _, + capacity, + tags: _, + }) => { + if let Some(c) = capacity { + *c + } else { + 0 + } + } + _ => 0, + }) + .collect(); + + (node_zone, node_capacity) + } +} #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; - - fn check_assignation(cl : &ClusterLayout) { - - //Check that input data has the right format - let nb_partitions = 1usize<>(); - - let zone_vec = node_zone.iter().unique().collect::>(); - let zone_nb_part = zone_vec.iter().map( |z| cl.ring_assignation_data.iter() - .filter(|x| node_zone[**x as usize] == **z) - .count() - ).collect::>(); - - //Check optimality of the zone assignation : would it be better for the - //node_capacity/node_partitions ratio to change the assignation of a partition - - if let Some(idmin) = (0..nb_nodes).min_by( - |i,j| 
(node_capacity[*i]*node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) - ){ - if let Some(idnew) = (0..nb_nodes) - .filter( |i| if let Some(p) = zone_vec.iter().position(|z| **z==node_zone[*i]) - {zone_nb_part[p] < nb_partitions } - else { false }) - .max_by( - |i,j| (node_capacity[*i]*(node_nb_part[*j]as u32+1)) - .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) - ){ - assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= - node_capacity[idnew]*node_nb_part[idmin] as u32); - } - - } - - //In every zone, check optimality of the nod assignation - for z in zone_vec { - let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z ); - if let Some(idmin) = node_of_z_iter.clone().min_by( - |i,j| (node_capacity[*i]*node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j]*node_nb_part[*i] as u32)) - ){ - if let Some(idnew) = node_of_z_iter.min_by( - |i,j| (node_capacity[*i]*(node_nb_part[*j] as u32+1)) - .cmp(&(node_capacity[*j]*(node_nb_part[*i] as u32+1))) - ){ - assert!(node_capacity[idmin]*(node_nb_part[idnew] as u32+1) >= - node_capacity[idnew]*node_nb_part[idmin] as u32); - } - } - } - - } - - fn update_layout(cl : &mut ClusterLayout, node_id_vec : &Vec, - node_capacity_vec : &Vec , node_zone_vec : &Vec) { - for i in 0..node_id_vec.len(){ - if let Some(x) = FixedBytes32::try_from(&[i as u8;32]) { - cl.node_id_vec.push(x); - } - - let update = cl.roles.update_mutator(cl.node_id_vec[i] , - NodeRoleV(Some(NodeRole{ - zone : (node_zone_vec[i].to_string()), - capacity : (Some(node_capacity_vec[i])), - tags : (vec![])}))); - cl.roles.merge(&update); - } - } - - #[test] - fn test_assignation() { - - let mut node_id_vec = vec![1,2,3]; - let mut node_capacity_vec = vec![4000,1000,2000]; - let mut node_zone_vec= vec!["A", "B", "C"].into_iter().map(|x| x.to_string()).collect(); - - let mut cl = ClusterLayout { - node_id_vec: vec![], - - roles : LwwMap::new(), - - replication_factor: 3, - ring_assignation_data : vec![], - version:0, - 
staging: LwwMap::new(), - staging_hash: sha256sum(&[1;32]), - }; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - node_id_vec = vec![1,2,3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000,1000,1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec= vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"].into_iter().map(|x| x.to_string()).collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - node_capacity_vec = vec![4000,1000,2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - - node_capacity_vec = vec![4000,4000,2000, 7000, 1000, 9000, 2000, 10, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); - - } -} + use super::*; + use itertools::Itertools; + + fn check_assignation(cl: &ClusterLayout) { + //Check that input data has the right format + let nb_partitions = 1usize << PARTITION_BITS; + assert!([1, 2, 3].contains(&cl.replication_factor)); + assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); + + let (node_zone, node_capacity) = cl.get_node_zone_capacity(); + + //Check that is is a correct assignation with zone redundancy + let rf = cl.replication_factor; + for i in 0..nb_partitions { + assert!( + rf == cl.ring_assignation_data[rf * i..rf * (i + 1)] + .iter() + .map(|nod| node_zone[*nod as usize].clone()) + .unique() + .count() + ); + } + + let nb_nodes = cl.node_id_vec.len(); + //Check optimality + let node_nb_part = (0..nb_nodes) + .map(|i| { + cl.ring_assignation_data + .iter() + .filter(|x| **x == i as u8) + .count() + }) + .collect::>(); + + let zone_vec = node_zone.iter().unique().collect::>(); + 
let zone_nb_part = zone_vec + .iter() + .map(|z| { + cl.ring_assignation_data + .iter() + .filter(|x| node_zone[**x as usize] == **z) + .count() + }) + .collect::>(); + + //Check optimality of the zone assignation : would it be better for the + //node_capacity/node_partitions ratio to change the assignation of a partition + + if let Some(idmin) = (0..nb_nodes).min_by(|i, j| { + (node_capacity[*i] * node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) + }) { + if let Some(idnew) = (0..nb_nodes) + .filter(|i| { + if let Some(p) = zone_vec.iter().position(|z| **z == node_zone[*i]) { + zone_nb_part[p] < nb_partitions + } else { + false + } + }) + .max_by(|i, j| { + (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) + .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) + }) { + assert!( + node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) + >= node_capacity[idnew] * node_nb_part[idmin] as u32 + ); + } + } + + //In every zone, check optimality of the nod assignation + for z in zone_vec { + let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z); + if let Some(idmin) = node_of_z_iter.clone().min_by(|i, j| { + (node_capacity[*i] * node_nb_part[*j] as u32) + .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) + }) { + if let Some(idnew) = node_of_z_iter.min_by(|i, j| { + (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) + .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) + }) { + assert!( + node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) + >= node_capacity[idnew] * node_nb_part[idmin] as u32 + ); + } + } + } + } + + fn update_layout( + cl: &mut ClusterLayout, + node_id_vec: &Vec, + node_capacity_vec: &Vec, + node_zone_vec: &Vec, + ) { + for i in 0..node_id_vec.len() { + if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { + cl.node_id_vec.push(x); + } + let update = cl.roles.update_mutator( + cl.node_id_vec[i], + NodeRoleV(Some(NodeRole { + zone: (node_zone_vec[i].to_string()), + capacity: 
(Some(node_capacity_vec[i])), + tags: (vec![]), + })), + ); + cl.roles.merge(&update); + } + } + + #[test] + fn test_assignation() { + let mut node_id_vec = vec![1, 2, 3]; + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let mut cl = ClusterLayout { + node_id_vec: vec![], + roles: LwwMap::new(), + replication_factor: 3, + ring_assignation_data: vec![], + version: 0, + staging: LwwMap::new(), + staging_hash: sha256sum(&[1; 32]), + }; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + + node_capacity_vec = vec![4000, 4000, 2000, 7000, 1000, 9000, 2000, 10, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); + cl.calculate_partition_assignation(); + check_assignation(&cl); + } +} diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs index aec7b042..ade831a4 100644 --- a/src/util/bipartite.rs +++ b/src/util/bipartite.rs @@ -1,378 +1,376 @@ /* - * This module deals with graph algorithm in complete bipartite + * This module deals with graph algorithm in complete bipartite * graphs. It is used in layout.rs to build the partition to node * assignation. 
* */ -use std::cmp::{min,max}; -use std::collections::VecDeque; use rand::prelude::SliceRandom; +use std::cmp::{max, min}; +use std::collections::VecDeque; //Graph data structure for the flow algorithm. -#[derive(Clone,Copy,Debug)] -struct EdgeFlow{ - c : i32, - flow : i32, - v : usize, - rev : usize, +#[derive(Clone, Copy, Debug)] +struct EdgeFlow { + c: i32, + flow: i32, + v: usize, + rev: usize, } //Graph data structure for the detection of positive cycles. -#[derive(Clone,Copy,Debug)] -struct WeightedEdge{ - w : i32, - u : usize, - v : usize, +#[derive(Clone, Copy, Debug)] +struct WeightedEdge { + w: i32, + u: usize, + v: usize, } - -/* This function takes two matchings (old_match and new_match) in a - * complete bipartite graph. It returns a matching that has the +/* This function takes two matchings (old_match and new_match) in a + * complete bipartite graph. It returns a matching that has the * same degree as new_match at every vertex, and that is as close * as possible to old_match. * */ -pub fn optimize_matching( old_match : &Vec> , - new_match : &Vec> , - nb_right : usize ) - -> Vec> { - let nb_left = old_match.len(); - let ed = WeightedEdge{w:-1,u:0,v:0}; - let mut edge_vec = vec![ed ; nb_left*nb_right]; - - //We build the complete bipartite graph structure, represented - //by the list of all edges. - for i in 0..nb_left { - for j in 0..nb_right{ - edge_vec[i*nb_right + j].u = i; - edge_vec[i*nb_right + j].v = nb_left+j; - } - } - - for i in 0..edge_vec.len() { - //We add the old matchings - if old_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) { - edge_vec[i].w *= -1; - } - //We add the new matchings - if new_match[edge_vec[i].u].contains(&(edge_vec[i].v-nb_left)) { - (edge_vec[i].u,edge_vec[i].v) = - (edge_vec[i].v,edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - //Now edge_vec is a graph where edges are oriented LR if we - //can add them to new_match, and RL otherwise. 
If - //adding/removing them makes the matching closer to old_match - //they have weight 1; and -1 otherwise. - - //We shuffle the edge list so that there is no bias depending in - //partitions/zone label in the triplet dispersion - let mut rng = rand::thread_rng(); - edge_vec.shuffle(&mut rng); - - //Discovering and flipping a cycle with positive weight in this - //graph will make the matching closer to old_match. - //We use Bellman Ford algorithm to discover positive cycles - loop{ - if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { - for i in cycle { - //We flip the edges of the cycle. - (edge_vec[i].u,edge_vec[i].v) = - (edge_vec[i].v,edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - else { - //If there is no cycle, we return the optimal matching. - break; - } - } - - //The optimal matching is build from the graph structure. - let mut matching = vec![Vec::::new() ; nb_left]; - for e in edge_vec { - if e.u > e.v { - matching[e.v].push(e.u-nb_left); - } - } - matching +pub fn optimize_matching( + old_match: &Vec>, + new_match: &Vec>, + nb_right: usize, +) -> Vec> { + let nb_left = old_match.len(); + let ed = WeightedEdge { w: -1, u: 0, v: 0 }; + let mut edge_vec = vec![ed; nb_left * nb_right]; + + //We build the complete bipartite graph structure, represented + //by the list of all edges. + for i in 0..nb_left { + for j in 0..nb_right { + edge_vec[i * nb_right + j].u = i; + edge_vec[i * nb_right + j].v = nb_left + j; + } + } + + for i in 0..edge_vec.len() { + //We add the old matchings + if old_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { + edge_vec[i].w *= -1; + } + //We add the new matchings + if new_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { + (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); + edge_vec[i].w *= -1; + } + } + //Now edge_vec is a graph where edges are oriented LR if we + //can add them to new_match, and RL otherwise. 
If + //adding/removing them makes the matching closer to old_match + //they have weight 1; and -1 otherwise. + + //We shuffle the edge list so that there is no bias depending in + //partitions/zone label in the triplet dispersion + let mut rng = rand::thread_rng(); + edge_vec.shuffle(&mut rng); + + //Discovering and flipping a cycle with positive weight in this + //graph will make the matching closer to old_match. + //We use Bellman Ford algorithm to discover positive cycles + loop { + if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { + for i in cycle { + //We flip the edges of the cycle. + (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); + edge_vec[i].w *= -1; + } + } else { + //If there is no cycle, we return the optimal matching. + break; + } + } + + //The optimal matching is build from the graph structure. + let mut matching = vec![Vec::::new(); nb_left]; + for e in edge_vec { + if e.u > e.v { + matching[e.v].push(e.u - nb_left); + } + } + matching } //This function finds a positive cycle in a bipartite wieghted graph. -fn positive_cycle( edge_vec : &Vec, nb_left : usize, - nb_right : usize) -> Option> { - let nb_side_min = min(nb_left, nb_right); - let nb_vertices = nb_left+nb_right; - let weight_lowerbound = -((nb_left +nb_right) as i32) -1; - let mut accessed = vec![false ; nb_left]; - - //We try to find a positive cycle accessible from the left - //vertex i. - for i in 0..nb_left{ - if accessed[i] { - continue; - } - let mut weight =vec![weight_lowerbound ; nb_vertices]; - let mut prev =vec![ edge_vec.len() ; nb_vertices]; - weight[i] = 0; - //We compute largest weighted paths from i. - //Since the graph is bipartite, any simple cycle has length - //at most 2*nb_side_min. In the general Bellman-Ford - //algorithm, the bound here is the number of vertices. Since - //the number of partitions can be much larger than the - //number of nodes, we optimize that. 
- for _ in 0..(2*nb_side_min) { - for j in 0..edge_vec.len() { - let e = edge_vec[j]; - if weight[e.v] < weight[e.u]+e.w { - weight[e.v] = weight[e.u]+e.w; - prev[e.v] = j; - } - } - } - //We update the accessed table - for i in 0..nb_left { - if weight[i] > weight_lowerbound { - accessed[i] = true; - } - } - //We detect positive cycle - for e in edge_vec { - if weight[e.v] < weight[e.u]+e.w { - //it means e is on a path branching from a positive cycle - let mut was_seen = vec![false ; nb_vertices]; - let mut curr = e.u; - //We track back with prev until we reach the cycle. - while !was_seen[curr]{ - was_seen[curr] = true; - curr = edge_vec[prev[curr]].u; - } - //Now curr is on the cycle. We collect the edges ids. - let mut cycle = Vec::::new(); - cycle.push(prev[curr]); - let mut cycle_vert = edge_vec[prev[curr]].u; - while cycle_vert != curr { - cycle.push(prev[cycle_vert]); - cycle_vert = edge_vec[prev[cycle_vert]].u; - } - - return Some(cycle); - } - } - } - - None -} +fn positive_cycle( + edge_vec: &Vec, + nb_left: usize, + nb_right: usize, +) -> Option> { + let nb_side_min = min(nb_left, nb_right); + let nb_vertices = nb_left + nb_right; + let weight_lowerbound = -((nb_left + nb_right) as i32) - 1; + let mut accessed = vec![false; nb_left]; + //We try to find a positive cycle accessible from the left + //vertex i. + for i in 0..nb_left { + if accessed[i] { + continue; + } + let mut weight = vec![weight_lowerbound; nb_vertices]; + let mut prev = vec![edge_vec.len(); nb_vertices]; + weight[i] = 0; + //We compute largest weighted paths from i. + //Since the graph is bipartite, any simple cycle has length + //at most 2*nb_side_min. In the general Bellman-Ford + //algorithm, the bound here is the number of vertices. Since + //the number of partitions can be much larger than the + //number of nodes, we optimize that. 
+ for _ in 0..(2 * nb_side_min) { + for j in 0..edge_vec.len() { + let e = edge_vec[j]; + if weight[e.v] < weight[e.u] + e.w { + weight[e.v] = weight[e.u] + e.w; + prev[e.v] = j; + } + } + } + //We update the accessed table + for i in 0..nb_left { + if weight[i] > weight_lowerbound { + accessed[i] = true; + } + } + //We detect positive cycle + for e in edge_vec { + if weight[e.v] < weight[e.u] + e.w { + //it means e is on a path branching from a positive cycle + let mut was_seen = vec![false; nb_vertices]; + let mut curr = e.u; + //We track back with prev until we reach the cycle. + while !was_seen[curr] { + was_seen[curr] = true; + curr = edge_vec[prev[curr]].u; + } + //Now curr is on the cycle. We collect the edges ids. + let mut cycle = Vec::::new(); + cycle.push(prev[curr]); + let mut cycle_vert = edge_vec[prev[curr]].u; + while cycle_vert != curr { + cycle.push(prev[cycle_vert]); + cycle_vert = edge_vec[prev[cycle_vert]].u; + } -// This function takes two arrays of capacity and computes the -// maximal matching in the complete bipartite graph such that the + return Some(cycle); + } + } + } + + None +} + +// This function takes two arrays of capacity and computes the +// maximal matching in the complete bipartite graph such that the // left vertex i is matched to left_cap_vec[i] right vertices, and // the right vertex j is matched to right_cap_vec[j] left vertices. // To do so, we use Dinic's maximum flow algorithm. 
-pub fn dinic_compute_matching( left_cap_vec : Vec, - right_cap_vec : Vec) -> Vec< Vec > -{ - let mut graph = Vec:: >::new(); - let ed = EdgeFlow{c:0,flow:0,v:0, rev:0}; - - // 0 will be the source - graph.push(vec![ed ; left_cap_vec.len()]); - for i in 0..left_cap_vec.len() - { - graph[0][i].c = left_cap_vec[i] as i32; - graph[0][i].v = i+2; - graph[0][i].rev = 0; - } - - //1 will be the sink - graph.push(vec![ed ; right_cap_vec.len()]); - for i in 0..right_cap_vec.len() - { - graph[1][i].c = right_cap_vec[i] as i32; - graph[1][i].v = i+2+left_cap_vec.len(); - graph[1][i].rev = 0; - } - - //we add left vertices - for i in 0..left_cap_vec.len() { - graph.push(vec![ed ; 1+right_cap_vec.len()]); - graph[i+2][0].c = 0; //directed - graph[i+2][0].v = 0; - graph[i+2][0].rev = i; - - for j in 0..right_cap_vec.len() { - graph[i+2][j+1].c = 1; - graph[i+2][j+1].v = 2+left_cap_vec.len()+j; - graph[i+2][j+1].rev = i+1; - } - } - - //we add right vertices - for i in 0..right_cap_vec.len() { - let lft_ln = left_cap_vec.len(); - graph.push(vec![ed ; 1+lft_ln]); - graph[i+lft_ln+2][0].c = graph[1][i].c; - graph[i+lft_ln+2][0].v = 1; - graph[i+lft_ln+2][0].rev = i; - - for j in 0..left_cap_vec.len() { - graph[i+2+lft_ln][j+1].c = 0; //directed - graph[i+2+lft_ln][j+1].v = j+2; - graph[i+2+lft_ln][j+1].rev = i+1; - } - } - - //To ensure the dispersion of the triplets generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //left vertices do not consider the right ones in the same order. - let mut rng = rand::thread_rng(); - for i in 0..graph.len() { - graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. - for j in 0..graph[i].len() { - let target_v = graph[i][j].v; - let target_rev = graph[i][j].rev; - graph[target_v][target_rev].rev = j; - } - } - - let nb_vertices = graph.len(); - - //We run Dinic's max flow algorithm - loop{ - //We build the level array from Dinic's algorithm. 
- let mut level = vec![-1; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((0,0)); - while !fifo.is_empty() { - if let Some((id,lvl)) = fifo.pop_front(){ - if level[id] == -1 { - level[id] = lvl; - for e in graph[id].iter(){ - if e.c-e.flow > 0{ - fifo.push_back((e.v,lvl+1)); - } - } - } - } - } - if level[1] == -1 { - //There is no residual flow - break; - } - - //Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); - - let flow_upper_bound; - if let Some(x) = left_cap_vec.iter().max() { - flow_upper_bound=*x as i32; - } - else { - flow_upper_bound = 0; - assert!(false); - } - - lifo.push_back((0,flow_upper_bound)); - - loop - { - if let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == 1 { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id,_)) = lifo.pop_back(){ - let nbd=next_nbd[id]; - graph[id][nbd].flow += f; - let id_v = graph[id][nbd].v; - let nbd_v = graph[id][nbd].rev; - graph[id_v][nbd_v].flow -= f; - } - } - lifo.push_back((0,flow_upper_bound)); - continue; - } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back(){ - next_nbd[*parent] +=1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f,graph[id][nbd].c - - graph[id][nbd].flow); - if level[graph[id][nbd].v] <= level[id] || - new_flow == 0 { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - //otherwise, we send flow to nbd. 
- lifo.push_back((graph[id][nbd].v, new_flow)); - } - else { - break; - } - } - } - - //We return the association - let assoc_table = (0..left_cap_vec.len()).map( - |id| graph[id+2].iter() - .filter(|e| e.flow > 0) - .map( |e| e.v-2-left_cap_vec.len()) - .collect()).collect(); - - //consistency check - - //it is a flow - for i in 3..graph.len(){ - assert!( graph[i].iter().map(|e| e.flow).sum::() == 0); - for e in graph[i].iter(){ - assert!(e.flow + graph[e.v][e.rev].flow == 0); - } - } - - //it solves the matching problem - for i in 0..left_cap_vec.len(){ - assert!(left_cap_vec[i] as i32 == - graph[i+2].iter().map(|e| max(0,e.flow)).sum::()); - } - for i in 0..right_cap_vec.len(){ - assert!(right_cap_vec[i] as i32 == - graph[i+2+left_cap_vec.len()].iter() - .map(|e| max(0,e.flow)).sum::()); - } - - - assoc_table -} +pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) -> Vec> { + let mut graph = Vec::>::new(); + let ed = EdgeFlow { + c: 0, + flow: 0, + v: 0, + rev: 0, + }; + + // 0 will be the source + graph.push(vec![ed; left_cap_vec.len()]); + for i in 0..left_cap_vec.len() { + graph[0][i].c = left_cap_vec[i] as i32; + graph[0][i].v = i + 2; + graph[0][i].rev = 0; + } + + //1 will be the sink + graph.push(vec![ed; right_cap_vec.len()]); + for i in 0..right_cap_vec.len() { + graph[1][i].c = right_cap_vec[i] as i32; + graph[1][i].v = i + 2 + left_cap_vec.len(); + graph[1][i].rev = 0; + } + + //we add left vertices + for i in 0..left_cap_vec.len() { + graph.push(vec![ed; 1 + right_cap_vec.len()]); + graph[i + 2][0].c = 0; //directed + graph[i + 2][0].v = 0; + graph[i + 2][0].rev = i; + + for j in 0..right_cap_vec.len() { + graph[i + 2][j + 1].c = 1; + graph[i + 2][j + 1].v = 2 + left_cap_vec.len() + j; + graph[i + 2][j + 1].rev = i + 1; + } + } + //we add right vertices + for i in 0..right_cap_vec.len() { + let lft_ln = left_cap_vec.len(); + graph.push(vec![ed; 1 + lft_ln]); + graph[i + lft_ln + 2][0].c = graph[1][i].c; + graph[i + lft_ln + 2][0].v = 
1; + graph[i + lft_ln + 2][0].rev = i; + + for j in 0..left_cap_vec.len() { + graph[i + 2 + lft_ln][j + 1].c = 0; //directed + graph[i + 2 + lft_ln][j + 1].v = j + 2; + graph[i + 2 + lft_ln][j + 1].rev = i + 1; + } + } + + //To ensure the dispersion of the triplets generated by the + //assignation, we shuffle the neighbours of the nodes. Hence, + //left vertices do not consider the right ones in the same order. + let mut rng = rand::thread_rng(); + for i in 0..graph.len() { + graph[i].shuffle(&mut rng); + //We need to update the ids of the reverse edges. + for j in 0..graph[i].len() { + let target_v = graph[i][j].v; + let target_rev = graph[i][j].rev; + graph[target_v][target_rev].rev = j; + } + } + + let nb_vertices = graph.len(); + + //We run Dinic's max flow algorithm + loop { + //We build the level array from Dinic's algorithm. + let mut level = vec![-1; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((0, 0)); + while !fifo.is_empty() { + if let Some((id, lvl)) = fifo.pop_front() { + if level[id] == -1 { + level[id] = lvl; + for e in graph[id].iter() { + if e.c - e.flow > 0 { + fifo.push_back((e.v, lvl + 1)); + } + } + } + } + } + if level[1] == -1 { + //There is no residual flow + break; + } + + //Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = VecDeque::new(); + + let flow_upper_bound; + if let Some(x) = left_cap_vec.iter().max() { + flow_upper_bound = *x as i32; + } else { + flow_upper_bound = 0; + assert!(false); + } + + lifo.push_back((0, flow_upper_bound)); + + loop { + if let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == 1 { + //The DFS reached the sink, we can add a + //residual flow. 
+ lifo.pop_back(); + while !lifo.is_empty() { + if let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + graph[id][nbd].flow += f; + let id_v = graph[id][nbd].v; + let nbd_v = graph[id][nbd].rev; + graph[id_v][nbd_v].flow -= f; + } + } + lifo.push_back((0, flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back() { + next_nbd[*parent] += 1; + } + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow); + if level[graph[id][nbd].v] <= level[id] || new_flow == 0 { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + //otherwise, we send flow to nbd. + lifo.push_back((graph[id][nbd].v, new_flow)); + } else { + break; + } + } + } + + //We return the association + let assoc_table = (0..left_cap_vec.len()) + .map(|id| { + graph[id + 2] + .iter() + .filter(|e| e.flow > 0) + .map(|e| e.v - 2 - left_cap_vec.len()) + .collect() + }) + .collect(); + + //consistency check + + //it is a flow + for i in 3..graph.len() { + assert!(graph[i].iter().map(|e| e.flow).sum::() == 0); + for e in graph[i].iter() { + assert!(e.flow + graph[e.v][e.rev].flow == 0); + } + } + + //it solves the matching problem + for i in 0..left_cap_vec.len() { + assert!(left_cap_vec[i] as i32 == graph[i + 2].iter().map(|e| max(0, e.flow)).sum::()); + } + for i in 0..right_cap_vec.len() { + assert!( + right_cap_vec[i] as i32 + == graph[i + 2 + left_cap_vec.len()] + .iter() + .map(|e| max(0, e.flow)) + .sum::() + ); + } + + assoc_table +} #[cfg(test)] mod tests { - use super::*; - - #[test] - fn test_flow() { - let left_vec = vec![3;8]; - let right_vec = vec![0,4,8,4,8]; - //There are asserts in the function that computes the flow - let _ = dinic_compute_matching(left_vec, right_vec); - } - - //maybe add tests relative to the matching 
optilization ? -} + use super::*; + #[test] + fn test_flow() { + let left_vec = vec![3; 8]; + let right_vec = vec![0, 4, 8, 4, 8]; + //There are asserts in the function that computes the flow + let _ = dinic_compute_matching(left_vec, right_vec); + } + //maybe add tests relative to the matching optilization ? +} -- cgit v1.2.3 From 3ba2c5b4246d1063e433cb349aba2ac40c376654 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 1 May 2022 10:11:43 +0200 Subject: updated cargo.lock --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index f61e2506..1a9eac10 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1028,6 +1028,7 @@ dependencies = [ "gethostname", "hex", "hyper", + "itertools 0.10.3", "k8s-openapi", "kube", "kuska-sodiumoxide", -- cgit v1.2.3 From 948ff93cf10da1705766c2f0d256c316adcb806b Mon Sep 17 00:00:00 2001 From: Mendes Date: Sun, 1 May 2022 16:05:39 +0200 Subject: Corrected the warnings and errors issued by cargo clippy --- src/rpc/layout.rs | 26 +++++------ src/util/bipartite.rs | 117 ++++++++++++++++++++++---------------------------- 2 files changed, 63 insertions(+), 80 deletions(-) diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index ac31da72..d0ee3463 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -195,8 +195,8 @@ impl ClusterLayout { .collect::>(); //We create an indexing of the zones let mut zone_id = HashMap::::new(); - for i in 0..part_per_zone_vec.len() { - zone_id.insert(part_per_zone_vec[i].0.clone(), i); + for (i, ppz) in part_per_zone_vec.iter().enumerate() { + zone_id.insert(ppz.0.clone(), i); } //We compute a candidate for the new partition to zone @@ -212,7 +212,7 @@ impl ClusterLayout { let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; //We will decrement part_per_nod to keep track of the number //of partitions that we still have to associate. 
- let mut part_per_nod = part_per_nod.clone(); + let mut part_per_nod = part_per_nod; //We minimize the distance to the former assignation(if any) @@ -265,7 +265,7 @@ impl ClusterLayout { && part_per_nod[*id] > 0 }) .collect(); - assert!(possible_nodes.len() > 0); + assert!(!possible_nodes.is_empty()); //We randomly pick a node if let Some(nod) = possible_nodes.choose(&mut rng) { node_assignation[i][j] = Some(*nod); @@ -277,12 +277,12 @@ impl ClusterLayout { //We write the assignation in the 1D table self.ring_assignation_data = Vec::::new(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if let Some(id) = node_assignation[i][j] { + for ass in node_assignation { + for nod in ass { + if let Some(id) = nod { self.ring_assignation_data.push(id as CompactNodeType); } else { - assert!(false) + panic!() } } } @@ -318,7 +318,7 @@ impl ClusterLayout { self.node_id_vec = new_node_id_vec; self.ring_assignation_data = vec![]; - return node_assignation; + node_assignation } ///This function compute the number of partition to assign to @@ -345,7 +345,7 @@ impl ClusterLayout { //Compute the optimal number of partitions per zone let sum_capacities: u32 = zone_capacity.values().sum(); - if sum_capacities <= 0 { + if sum_capacities == 0 { println!("No storage capacity in the network."); return None; } @@ -493,14 +493,10 @@ impl ClusterLayout { .map(|id_nod| match self.node_role(id_nod) { Some(NodeRole { zone: _, - capacity, + capacity: Some(c), tags: _, }) => { - if let Some(c) = capacity { *c - } else { - 0 - } } _ => 0, }) diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs index ade831a4..1e1e9caa 100644 --- a/src/util/bipartite.rs +++ b/src/util/bipartite.rs @@ -31,8 +31,8 @@ struct WeightedEdge { * as possible to old_match. 
* */ pub fn optimize_matching( - old_match: &Vec>, - new_match: &Vec>, + old_match: &[Vec], + new_match: &[Vec], nb_right: usize, ) -> Vec> { let nb_left = old_match.len(); @@ -72,16 +72,11 @@ pub fn optimize_matching( //Discovering and flipping a cycle with positive weight in this //graph will make the matching closer to old_match. //We use Bellman Ford algorithm to discover positive cycles - loop { - if let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { - for i in cycle { - //We flip the edges of the cycle. - (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); - edge_vec[i].w *= -1; - } - } else { - //If there is no cycle, we return the optimal matching. - break; + while let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { + for i in cycle { + //We flip the edges of the cycle. + (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); + edge_vec[i].w *= -1; } } @@ -97,7 +92,7 @@ pub fn optimize_matching( //This function finds a positive cycle in a bipartite wieghted graph. fn positive_cycle( - edge_vec: &Vec, + edge_vec: &[WeightedEdge], nb_left: usize, nb_right: usize, ) -> Option> { @@ -122,8 +117,7 @@ fn positive_cycle( //the number of partitions can be much larger than the //number of nodes, we optimize that. for _ in 0..(2 * nb_side_min) { - for j in 0..edge_vec.len() { - let e = edge_vec[j]; + for (j, e) in edge_vec.iter().enumerate() { if weight[e.v] < weight[e.u] + e.w { weight[e.v] = weight[e.u] + e.w; prev[e.v] = j; @@ -148,8 +142,7 @@ fn positive_cycle( curr = edge_vec[prev[curr]].u; } //Now curr is on the cycle. We collect the edges ids. 
- let mut cycle = Vec::::new(); - cycle.push(prev[curr]); + let mut cycle = vec![prev[curr]]; let mut cycle_vert = edge_vec[prev[curr]].u; while cycle_vert != curr { cycle.push(prev[cycle_vert]); @@ -180,16 +173,16 @@ pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) - // 0 will be the source graph.push(vec![ed; left_cap_vec.len()]); - for i in 0..left_cap_vec.len() { - graph[0][i].c = left_cap_vec[i] as i32; + for (i, c) in left_cap_vec.iter().enumerate() { + graph[0][i].c = *c as i32; graph[0][i].v = i + 2; graph[0][i].rev = 0; } //1 will be the sink graph.push(vec![ed; right_cap_vec.len()]); - for i in 0..right_cap_vec.len() { - graph[1][i].c = right_cap_vec[i] as i32; + for (i, c) in right_cap_vec.iter().enumerate() { + graph[1][i].c = *c as i32; graph[1][i].v = i + 2 + left_cap_vec.len(); graph[1][i].rev = 0; } @@ -267,58 +260,52 @@ pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) - let mut next_nbd = vec![0; nb_vertices]; let mut lifo = VecDeque::new(); - let flow_upper_bound; - if let Some(x) = left_cap_vec.iter().max() { - flow_upper_bound = *x as i32; + let flow_upper_bound = if let Some(x) = left_cap_vec.iter().max() { + *x as i32 } else { - flow_upper_bound = 0; - assert!(false); - } + panic!(); + }; lifo.push_back((0, flow_upper_bound)); - loop { - if let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == 1 { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - graph[id][nbd].flow += f; - let id_v = graph[id][nbd].v; - let nbd_v = graph[id][nbd].rev; - graph[id_v][nbd_v].flow -= f; - } + while let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == 1 { + //The DFS reached the sink, we can add a + //residual flow. 
+ lifo.pop_back(); + while !lifo.is_empty() { + if let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + graph[id][nbd].flow += f; + let id_v = graph[id][nbd].v; + let nbd_v = graph[id][nbd].rev; + graph[id_v][nbd_v].flow -= f; } - lifo.push_back((0, flow_upper_bound)); - continue; } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { - next_nbd[*parent] += 1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow); - if level[graph[id][nbd].v] <= level[id] || new_flow == 0 { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; + lifo.push_back((0, flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back() { + next_nbd[*parent] += 1; } - //otherwise, we send flow to nbd. - lifo.push_back((graph[id][nbd].v, new_flow)); - } else { - break; + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow); + if level[graph[id][nbd].v] <= level[id] || new_flow == 0 { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; } + //otherwise, we send flow to nbd. 
+ lifo.push_back((graph[id][nbd].v, new_flow)); } } -- cgit v1.2.3 From 617f28bfa466d52fac7244f08b3a036ab4e8c9af Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 5 May 2022 14:21:57 +0200 Subject: Correct small formatting issue --- src/rpc/layout.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index d0ee3463..40f97368 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -495,9 +495,7 @@ impl ClusterLayout { zone: _, capacity: Some(c), tags: _, - }) => { - *c - } + }) => *c, _ => 0, }) .collect(); -- cgit v1.2.3 From 03e3a1bd153bf7c3bf13216964fb17463a26aaae Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 18 Jul 2022 22:35:29 +0200 Subject: Added the latex report on the optimal layout algorithm --- doc/optimal_layout_report/figures/flow.pdf | Bin 0 -> 12947 bytes doc/optimal_layout_report/figures/flow.svg | 2205 +++++++++++ doc/optimal_layout_report/figures/mini_node.pdf | Bin 0 -> 18288 bytes doc/optimal_layout_report/figures/mini_node.svg | 3962 ++++++++++++++++++++ doc/optimal_layout_report/figures/mini_zone.pdf | Bin 0 -> 7446 bytes doc/optimal_layout_report/figures/mini_zone.svg | 1562 ++++++++ doc/optimal_layout_report/figures/naive.pdf | Bin 0 -> 18347 bytes doc/optimal_layout_report/figures/naive.svg | 3899 +++++++++++++++++++ doc/optimal_layout_report/optimal_layout.aux | 32 + doc/optimal_layout_report/optimal_layout.log | 303 ++ doc/optimal_layout_report/optimal_layout.pdf | Bin 0 -> 279062 bytes .../optimal_layout.synctex.gz | Bin 0 -> 84542 bytes doc/optimal_layout_report/optimal_layout.tex | 394 ++ 13 files changed, 12357 insertions(+) create mode 100644 doc/optimal_layout_report/figures/flow.pdf create mode 100644 doc/optimal_layout_report/figures/flow.svg create mode 100644 doc/optimal_layout_report/figures/mini_node.pdf create mode 100644 doc/optimal_layout_report/figures/mini_node.svg create mode 100644 doc/optimal_layout_report/figures/mini_zone.pdf create mode 100644 
doc/optimal_layout_report/figures/mini_zone.svg create mode 100644 doc/optimal_layout_report/figures/naive.pdf create mode 100644 doc/optimal_layout_report/figures/naive.svg create mode 100644 doc/optimal_layout_report/optimal_layout.aux create mode 100644 doc/optimal_layout_report/optimal_layout.log create mode 100644 doc/optimal_layout_report/optimal_layout.pdf create mode 100644 doc/optimal_layout_report/optimal_layout.synctex.gz create mode 100644 doc/optimal_layout_report/optimal_layout.tex diff --git a/doc/optimal_layout_report/figures/flow.pdf b/doc/optimal_layout_report/figures/flow.pdf new file mode 100644 index 00000000..3546ad0a Binary files /dev/null and b/doc/optimal_layout_report/figures/flow.pdf differ diff --git a/doc/optimal_layout_report/figures/flow.svg b/doc/optimal_layout_report/figures/flow.svg new file mode 100644 index 00000000..e370755e --- /dev/null +++ b/doc/optimal_layout_report/figures/flow.svg @@ -0,0 +1,2205 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/optimal_layout_report/figures/mini_node.pdf b/doc/optimal_layout_report/figures/mini_node.pdf new file mode 100644 index 00000000..6df8a5b2 Binary files /dev/null and b/doc/optimal_layout_report/figures/mini_node.pdf differ diff --git a/doc/optimal_layout_report/figures/mini_node.svg b/doc/optimal_layout_report/figures/mini_node.svg new file mode 100644 index 00000000..b044b0cd --- /dev/null +++ b/doc/optimal_layout_report/figures/mini_node.svg @@ -0,0 +1,3962 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff 
--git a/doc/optimal_layout_report/figures/mini_zone.pdf b/doc/optimal_layout_report/figures/mini_zone.pdf new file mode 100644 index 00000000..36085c52 Binary files /dev/null and b/doc/optimal_layout_report/figures/mini_zone.pdf differ diff --git a/doc/optimal_layout_report/figures/mini_zone.svg b/doc/optimal_layout_report/figures/mini_zone.svg new file mode 100644 index 00000000..5c505539 --- /dev/null +++ b/doc/optimal_layout_report/figures/mini_zone.svg @@ -0,0 +1,1562 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/optimal_layout_report/figures/naive.pdf b/doc/optimal_layout_report/figures/naive.pdf new file mode 100644 index 00000000..f32e4273 Binary files /dev/null and b/doc/optimal_layout_report/figures/naive.pdf differ diff --git a/doc/optimal_layout_report/figures/naive.svg b/doc/optimal_layout_report/figures/naive.svg new file mode 100644 index 00000000..0a40c45f 
--- /dev/null +++ b/doc/optimal_layout_report/figures/naive.svg @@ -0,0 +1,3899 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/optimal_layout_report/optimal_layout.aux b/doc/optimal_layout_report/optimal_layout.aux new file mode 100644 index 00000000..fe8b0891 --- /dev/null +++ b/doc/optimal_layout_report/optimal_layout.aux @@ -0,0 +1,32 @@ +\relax +\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}Context}{1}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}Formal description of the problem}{1}{}\protected@file@percent } +\newlabel{eq:optimal}{{{OPT}}{1}} +\@writefile{toc}{\contentsline {section}{\numberline {2}Properties of an optimal 3-strict assignment}{2}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Optimal assignment}{2}{}\protected@file@percent } +\newlabel{sec:opt_assign}{{2.1}{2}} +\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces On the left, the creation of a concrete assignment with the naive approach of repeating tokens. 
On the right, the zones containing the nodes.}}{4}{}\protected@file@percent } +\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Flow problem to compute and optimal assignment.}}{4}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Minimal transfer}{5}{}\protected@file@percent } +\newlabel{hyp:A}{{{H3A}}{5}} +\newlabel{hyp:B}{{{H3B}}{5}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {A)}Minimizing the zone discrepancy}{6}{}\protected@file@percent } +\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces On the left: the graph $G_T$ encoding an assignment to minimize the zone discrepancy. On the right: the graph $G_T$ encoding an assignment to minimize the node discrepancy.}}{7}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsubsection}{\numberline {B)}Minimizing the node discrepancy}{8}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsubsection}{\numberline {C)}Linear combination of both criteria}{8}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Algorithm}{9}{}\protected@file@percent } +\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Optimal 3-strict assignment}}{9}{}\protected@file@percent } +\newlabel{alg:total}{{1}{9}} +\@writefile{toc}{\contentsline {section}{\numberline {3}TODO}{9}{}\protected@file@percent } +\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces Computation of the optimal utilization}}{10}{}\protected@file@percent } +\newlabel{alg:util}{{2}{10}} +\newlabel{lin:subutil}{{2}{10}} +\newlabel{lin:loopsub}{{3}{10}} +\newlabel{lin:findmin}{{4}{10}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Computation of a candidate assignment}}{10}{}\protected@file@percent } +\newlabel{alg:opt}{{3}{10}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Minimization of the number of 
transfers}}{11}{}\protected@file@percent } +\newlabel{alg:mini}{{4}{11}} +\newlabel{lin:repeat}{{3}{11}} +\gdef \@abspage@last{11} diff --git a/doc/optimal_layout_report/optimal_layout.log b/doc/optimal_layout_report/optimal_layout.log new file mode 100644 index 00000000..c73818ff --- /dev/null +++ b/doc/optimal_layout_report/optimal_layout.log @@ -0,0 +1,303 @@ +This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020/Debian) (preloaded format=pdflatex 2022.6.23) 18 JUL 2022 22:33 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. +**optimal_layout.tex +(./optimal_layout.tex +LaTeX2e <2020-10-01> patch level 4 +L3 programming layer <2021-01-09> xparse <2020-03-03> +(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls +Document Class: article 2020/04/10 v1.4m Standard LaTeX document class +(/usr/share/texlive/texmf-dist/tex/latex/base/size10.clo +File: size10.clo 2020/04/10 v1.4m Standard LaTeX file (size option) +) +\c@part=\count177 +\c@section=\count178 +\c@subsection=\count179 +\c@subsubsection=\count180 +\c@paragraph=\count181 +\c@subparagraph=\count182 +\c@figure=\count183 +\c@table=\count184 +\abovecaptionskip=\skip47 +\belowcaptionskip=\skip48 +\bibindent=\dimen138 +) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsmath.sty +Package: amsmath 2020/09/23 v2.17i AMS math features +\@mathmargin=\skip49 + +For additional information on amsmath, use the `?' option. 
+(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amstext.sty +Package: amstext 2000/06/29 v2.01 AMS text + +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsgen.sty +File: amsgen.sty 1999/11/30 v2.0 generic functions +\@emptytoks=\toks15 +\ex@=\dimen139 +)) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsbsy.sty +Package: amsbsy 1999/11/29 v1.2d Bold Symbols +\pmbraise@=\dimen140 +) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsopn.sty +Package: amsopn 2016/03/08 v2.02 operator names +) +\inf@bad=\count185 +LaTeX Info: Redefining \frac on input line 234. +\uproot@=\count186 +\leftroot@=\count187 +LaTeX Info: Redefining \overline on input line 399. +\classnum@=\count188 +\DOTSCASE@=\count189 +LaTeX Info: Redefining \ldots on input line 496. +LaTeX Info: Redefining \dots on input line 499. +LaTeX Info: Redefining \cdots on input line 620. +\Mathstrutbox@=\box47 +\strutbox@=\box48 +\big@size=\dimen141 +LaTeX Font Info: Redeclaring font encoding OML on input line 743. +LaTeX Font Info: Redeclaring font encoding OMS on input line 744. +\macc@depth=\count190 +\c@MaxMatrixCols=\count191 +\dotsspace@=\muskip16 +\c@parentequation=\count192 +\dspbrk@lvl=\count193 +\tag@help=\toks16 +\row@=\count194 +\column@=\count195 +\maxfields@=\count196 +\andhelp@=\toks17 +\eqnshift@=\dimen142 +\alignsep@=\dimen143 +\tagshift@=\dimen144 +\tagwidth@=\dimen145 +\totwidth@=\dimen146 +\lineht@=\dimen147 +\@envbody=\toks18 +\multlinegap=\skip50 +\multlinetaggap=\skip51 +\mathdisplay@stack=\toks19 +LaTeX Info: Redefining \[ on input line 2923. +LaTeX Info: Redefining \] on input line 2924. +) +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amssymb.sty +Package: amssymb 2013/01/14 v3.01 AMS font symbols + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amsfonts.sty +Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support +\symAMSa=\mathgroup4 +\symAMSb=\mathgroup5 +LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. 
+LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' +(Font) U/euf/m/n --> U/euf/b/n on input line 106. +)) +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2020/09/09 v1.2b Enhanced LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 2014/10/28 v1.15 key=value parser (DPC) +\KV@toks@=\toks20 +) +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2020/08/30 v1.4c Standard LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2016/01/03 v1.10 sin cos tan (DPC) +) +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: pdftex.def on input line 105. + +(/usr/share/texlive/texmf-dist/tex/latex/graphics-def/pdftex.def +File: pdftex.def 2020/10/05 v1.2a Graphics/color driver for pdftex +)) +\Gin@req@height=\dimen148 +\Gin@req@width=\dimen149 +) +(/usr/share/texlive/texmf-dist/tex/latex/xcolor/xcolor.sty +Package: xcolor 2016/05/11 v2.12 LaTeX color extensions (UK) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package xcolor Info: Driver file: pdftex.def on input line 225. +Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1348. +Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1352. +Package xcolor Info: Model `RGB' extended on input line 1364. +Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1366. +Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1367. +Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1368. +Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1369. +Package xcolor Info: Model `Gray' substituted by `gray' on input line 1370. 
+Package xcolor Info: Model `wave' substituted by `hsb' on input line 1371. +) +(/usr/share/texlive/texmf-dist/tex/latex/algorithms/algorithm.sty +Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating enviro +nment + +(/usr/share/texlive/texmf-dist/tex/latex/float/float.sty +Package: float 2001/11/08 v1.3d Float enhancements (AL) +\c@float@type=\count197 +\float@exts=\toks21 +\float@box=\box49 +\@float@everytoks=\toks22 +\@floatcapt=\box50 +) +(/usr/share/texlive/texmf-dist/tex/latex/base/ifthen.sty +Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC) +) +\@float@every@algorithm=\toks23 +\c@algorithm=\count198 +) +(/usr/share/texlive/texmf-dist/tex/latex/algorithmicx/algpseudocode.sty +Package: algpseudocode + +(/usr/share/texlive/texmf-dist/tex/latex/algorithmicx/algorithmicx.sty +Package: algorithmicx 2005/04/27 v1.2 Algorithmicx + +Document Style algorithmicx 1.2 - a greatly improved `algorithmic' style +\c@ALG@line=\count199 +\c@ALG@rem=\count266 +\c@ALG@nested=\count267 +\ALG@tlm=\skip52 +\ALG@thistlm=\skip53 +\c@ALG@Lnr=\count268 +\c@ALG@blocknr=\count269 +\c@ALG@storecount=\count270 +\c@ALG@tmpcounter=\count271 +\ALG@tmplength=\skip54 +) +Document Style - pseudocode environments for use with the `algorithmicx' style +) (/usr/share/texlive/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +File: l3backend-pdftex.def 2020-01-29 L3 backend support: PDF output (pdfTeX) +\l__color_backend_stack_int=\count272 +\l__pdf_internal_box=\box51 +) +(./optimal_layout.aux) +\openout1 = `optimal_layout.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 15. +LaTeX Font Info: ... okay on input line 15. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 15. +LaTeX Font Info: ... okay on input line 15. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 15. +LaTeX Font Info: ... okay on input line 15. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 15. 
+LaTeX Font Info: ... okay on input line 15. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 15. +LaTeX Font Info: ... okay on input line 15. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 15. +LaTeX Font Info: ... okay on input line 15. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 15. +LaTeX Font Info: ... okay on input line 15. + +(/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count273 +\scratchdimen=\dimen150 +\scratchbox=\box52 +\nofMPsegments=\count274 +\nofMParguments=\count275 +\everyMPshowfont=\toks24 +\MPscratchCnt=\count276 +\MPscratchDim=\dimen151 +\MPnumerator=\count277 +\makeMPintoPDFobject=\count278 +\everyMPtoPDFconversion=\toks25 +) (/usr/share/texlive/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf +Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4 +85. + +(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv +e +)) +LaTeX Font Info: Trying to load font information for U+msa on input line 17. + + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsa.fd +File: umsa.fd 2013/01/14 v3.01 AMS symbols A +) +LaTeX Font Info: Trying to load font information for U+msb on input line 17. + + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsb.fd +File: umsb.fd 2013/01/14 v3.01 AMS symbols B +) [1 + +{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] [2] + +File: figures/naive.pdf Graphic file (type pdf) + +Package pdftex.def Info: figures/naive.pdf used on input line 117. +(pdftex.def) Requested size: 310.4979pt x 116.6252pt. + [3] + +File: figures/flow.pdf Graphic file (type pdf) + +Package pdftex.def Info: figures/flow.pdf used on input line 136. +(pdftex.def) Requested size: 207.0021pt x 104.94873pt. 
+ [4 <./figures/naive.pdf> <./figures/flow.pdf + +pdfTeX warning: /usr/bin/pdflatex (file ./figures/flow.pdf): PDF inclusion: mul +tiple pdfs with page group included in a single page +>] [5] + +File: figures/mini_zone.pdf Graphic file (type pdf) + +Package pdftex.def Info: figures/mini_zone.pdf used on input line 221. +(pdftex.def) Requested size: 110.39873pt x 138.8974pt. + +File: figures/mini_node.pdf Graphic file (type pdf) + +Package pdftex.def Info: figures/mini_node.pdf used on input line 225. +(pdftex.def) Requested size: 151.8014pt x 157.28752pt. + [6] +Overfull \hbox (6.52959pt too wide) in paragraph at lines 239--240 +[]\OT1/cmr/m/n/10 Assume that their ex-ist some as-sign-ment $\OML/cmm/m/it/10 +T[]$ \OT1/cmr/m/n/10 with the same uti-liza-tion $(\OML/cmm/m/it/10 n[]\OT1/cmr +/m/n/10 )[]$. + [] + +[7 <./figures/mini_zone.pdf> <./figures/mini_node.pdf + +pdfTeX warning: /usr/bin/pdflatex (file ./figures/mini_node.pdf): PDF inclusion +: multiple pdfs with page group included in a single page +>] [8] [9] [10] [11] (./optimal_layout.aux) ) +Here is how much of TeX's memory you used: + 3544 strings out of 481176 + 47263 string characters out of 5914226 + 339215 words of memory out of 5000000 + 20458 multiletter control sequences out of 15000+600000 + 413592 words of font info for 65 fonts, out of 8000000 for 9000 + 59 hyphenation exceptions out of 8191 + 68i,12n,74p,880b,308s stack positions out of 5000i,500n,10000p,200000b,80000s + +Output written on optimal_layout.pdf (11 pages, 279062 bytes). +PDF statistics: + 127 PDF objects out of 1000 (max. 8388607) + 90 compressed objects within 1 object stream + 0 named destinations out of 1000 (max. 500000) + 21 words of extra memory for PDF output out of 10000 (max. 
10000000) + diff --git a/doc/optimal_layout_report/optimal_layout.pdf b/doc/optimal_layout_report/optimal_layout.pdf new file mode 100644 index 00000000..84265135 Binary files /dev/null and b/doc/optimal_layout_report/optimal_layout.pdf differ diff --git a/doc/optimal_layout_report/optimal_layout.synctex.gz b/doc/optimal_layout_report/optimal_layout.synctex.gz new file mode 100644 index 00000000..376399c7 Binary files /dev/null and b/doc/optimal_layout_report/optimal_layout.synctex.gz differ diff --git a/doc/optimal_layout_report/optimal_layout.tex b/doc/optimal_layout_report/optimal_layout.tex new file mode 100644 index 00000000..843e0be6 --- /dev/null +++ b/doc/optimal_layout_report/optimal_layout.tex @@ -0,0 +1,394 @@ +\documentclass[]{article} + +\usepackage{amsmath,amssymb} + +\usepackage{graphicx,xcolor} + +\usepackage{algorithm,algpseudocode,float} + +\renewcommand\thesubsubsection{\Alph{subsubsection})} + +%opening +\title{Optimal partition assignment in Garage} +\author{Mendes Oulamara} + +\begin{document} + +\maketitle + +\section{Introduction} + +\subsection{Context} + +Garage is an open-source distributed storage service blablabla$\dots$ + +Every object to be stored in the system falls in a partition given by the last $k$ bits of its hash. There are $N=2^k$ partitions. Every partition will be stored on distinct nodes of the system. The goal of the assignment of partitions to nodes is to ensure (node and zone) redundancy and to be as efficient as possible. + +\subsection{Formal description of the problem} + +We are given a set of nodes $V$ and a set of zones $Z$. Every node $v$ has a non-negative storage capacity $c_v\ge 0$ and belongs to a zone $z_v\in Z$. We are also given a number of partitions $N>0$ (typically $N=256$). + +We would like to compute an assignment of three nodes to every partition. That is, for every $1\le i\le N$, we compute a triplet of three distinct nodes $T_i=(T_i^1, T_i^2, T_i^3) \in V^3$. 
We will impose some redundancy constraints on this assignment, and under these constraints, we want our system to have the largest storage capacity possible. To link storage capacity to partition assignment, we make the following assumption: +\begin{equation} + \tag{H1} + \text{\emph{All partitions have the same size $s$.}} +\end{equation} +This assumption is justified by the dispersion of the hashing function, when the number of partitions is small relative to the number of stored large objects. + +Every node $v$ needs to store $n_v = \#\{ 1\le i\le N ~|~ v\in T_i \}$ partitions (where $\#$ denotes the number of indices in the set). Hence the partitions stored by $v$ (and hence all partitions by our assumption) have their size bounded by $c_v/n_v$. This remark leads us to define the optimal size that we will want to maximize: + +\begin{equation} + \label{eq:optimal} + \tag{OPT} +s^* = \min_{v \in V} \frac{c_v}{n_v}. +\end{equation} + +When the capacities of the nodes are updated (this includes adding or removing a node), we want to update the assignment as well. However, transferring the data between nodes has a cost and we would like to limit the number of changes in the assignment. We make the following assumption: +\begin{equation} + \tag{H2} + \text{\emph{Updates of capacity happen rarely relative to object storage.}} +\end{equation} +This assumption justifies that when we compute the new assignment, it is worthwhile to optimize the partition size \eqref{eq:optimal} first, and then, among the possible optimal solutions, to try to minimize the number of partition transfers. + +For now, in the following, we impose the following redundancy constraint: + +\textbf{Mode 3-strict:} every partition needs to be assigned to three nodes belonging to three different zones. 
+ +\section{Properties of an optimal 3-strict assignment} + +\subsection{Optimal assignment} +\label{sec:opt_assign} + +For every zone $z\in Z$, define the zone capacity $c_z = \sum_{v, z_v=z} c_v$ and define $C = \sum_v c_v = \sum_z c_z$. + +One can check that the best we could do to maximize $s^*$ would be to use the nodes proportionally to their capacity. This would yield $s^*=C/(3N)$. This is not possible because of (i) redundancy constraints and (ii) integer rounding but it gives an upper bound. + +\subsubsection*{Optimal utilization} + +We call a \emph{utilization} a collection of non-negative integers $(n_v)_{v\in V}$ such that $\sum_v n_v = 3N$ and for every zone $z$, $\sum_{v\in z} n_v \le N$. We call such a utilization \emph{optimal} if it maximizes $s^*$. + +We start by computing a node sub-utilization $(\hat{n}_v)_{v\in V}$ such that for every zone $z$, $\sum_{v\in z} \hat{n}_v \le N$ and we show that there is an optimal utilization respecting the constraints and such that $\hat{n}_v \le n_v$ for every node. + +Assume that there is a zone $z_0$ such that $c_{z_0}/C \ge 1/3$. Then for any $v\in z_0$, we define +$$\hat{n}_v = \left\lfloor\frac{c_v}{c_{z_0}}N\right\rfloor.$$ +This choice ensures for any such $v$ that +$$ +\frac{c_v}{\hat{n}_v} \ge \frac{c_{z_0}}{N} \ge \frac{C}{3N} +$$ +which is the universal upper bound on $s^*$. Hence any optimal utilization $(n_v)$ can be modified to another optimal utilization such that $n_v\ge \hat{n}_v$. + +Because $z_0$ cannot store more than $N$ partition occurrences, in any assignment, at least $2N$ partitions must be assigned to the zones $Z\setminus\{z_0\}$. Let $C_0 = C-c_{z_0}$. Suppose that there exists a zone $z_1\neq z_0$ such that $c_{z_1}/C_0 \ge 1/2$. Then, with the same argument as for $z_0$, we can define +$$\hat{n}_v = \left\lfloor\frac{c_v}{c_{z_1}}N\right\rfloor$$ +for every $v\in z_1$. + +Now we can assign the remaining partitions. 
Let $(\hat{N}, \hat{C})$ be +\begin{itemize} + \item $(3N,C)$ if we did not find any $z_0$; + \item $(2N,C-c_{z_0})$ if there was a $z_0$ but no $z_1$; + \item $(N,C-c_{z_0}-c_{z_1})$ if there was a $z_0$ and a $z_1$. +\end{itemize} +Then at least $\hat{N}$ partitions must be spread among the remaining zones. Hence $s^*$ is upper bounded by $\hat{C}/\hat{N}$ and without loss of generality, we can define, for every node that is in neither $z_0$ nor $z_1$, +$$\hat{n}_v = \left\lfloor\frac{c_v}{\hat{C}}\hat{N}\right\rfloor.$$ + +We constructed a sub-utilization $\hat{n}_v$. Now notice that $3N-\sum_v \hat{n}_v \le \# V$ where $\# V$ denotes the number of nodes. We can iteratively pick a node $v^*$ such that +\begin{itemize} + \item $\sum_{v\in z_{v^*}} \hat{n}_v < N$ where $z_{v^*}$ is the zone of $v^*$; + \item $v^*$ maximizes the quantity $c_v/(\hat{n}_v+1)$ among the vertices satisfying the first condition (i.e. not in a saturated zone). +\end{itemize} +We iterate these instructions until $\sum_v \hat{n}_v= 3N$, and at this stage we define $(n_v) = (\hat{n}_v)$. It is easy to prove by induction that at every step, there is an optimal utilization that is pointwise larger than $\hat{n}_v$, and in particular, that $(n_v)$ is optimal. + +\subsubsection*{Existence of an optimal assignment} + +As for now, the \emph{optimal utilization} that we obtained is just a vector of numbers and it is not clear that it can be realized as the utilization of some concrete assignment. Here is a way to get a concrete assignment. + +Define $3N$ tokens $t_1,\ldots, t_{3N}\in V$ as follows: +\begin{itemize} + \item Enumerate the zones $z$ of $Z$ in any order; + \item enumerate the nodes $v$ of $z$ in any order; + \item repeat $n_v$ times the token $v$. +\end{itemize} +Then for $1\le i \le N$, define the triplet $T_i$ to be +$(t_i, t_{i+N}, t_{i+2N})$. Since the same nodes of a zone appear contiguously, the three nodes of a triplet must belong to three distinct zones. 
+ +However simple, this solution to go from a utilization to an assignment has the drawback of not spreading the triplets: a node will tend to be associated to the same two other nodes for many partitions. Hence, during data transfer, it will tend to use only two links, instead of spreading the bandwidth use over many other links to other nodes. To achieve this goal, we will reframe the search of an assignment as a flow problem, and in the flow algorithm, we will introduce randomness in the order of exploration. This will be sufficient to obtain a good dispersion of the triplets. + +\begin{figure} + \centering + \includegraphics[width=0.9\linewidth]{figures/naive} + \caption{On the left, the creation of a concrete assignment with the naive approach of repeating tokens. On the right, the zones containing the nodes.} +\end{figure} + +\subsubsection*{Assignment as a maximum flow problem} + +We describe the flow problem via its graph $(X,E)$ where $X$ is a set of vertices, and $E$ are directed weighted edges between the vertices. For every zone $z$, define $n_z=\sum_{v\in z} n_v$. + +The set of vertices $X$ contains the source $\mathbf{s}$ and the sink $\mathbf{t}$; a vertex $\mathbf{x}_z$ for every zone $z\in Z$, and a vertex $\mathbf{y}_i$ for every partition index $1\le i\le N$. + +The set of edges $E$ contains +\begin{itemize} + \item the edge $(\mathbf{s}, \mathbf{x}_z, n_z)$ for every zone $z\in Z$; + \item the edge $(\mathbf{x}_z, \mathbf{y}_i, 1)$ for every zone $z\in Z$ and partition $1\le i\le N$; + \item the edge $(\mathbf{y}_i, \mathbf{t}, 3)$ for every partition $1\le i\le N$. +\end{itemize} + +\begin{figure}[b] + \centering + \includegraphics[width=0.6\linewidth]{figures/flow} + \caption{Flow problem to compute an optimal assignment.} +\end{figure} + +We first show the equivalence between this problem and the construction of an assignment. 
Given some optimal assignment $(T_i)_{1\le i\le N}$ with utilization $(n_v)_{v\in V}$, define the flow $f:E\to \mathbb{N}$ that saturates every edge from $\mathbf{s}$ or to $\mathbf{t}$, takes value $1$ on the edge between $\mathbf{x}_z$ and $\mathbf{y}_i$ if partition $i$ is stored in some node of the zone $z$, and $0$ otherwise. One can easily check that $f$ thus defined is indeed a flow and is maximum. + +Reciprocally, by the existence of maximum flows constructed from optimal assignments, any maximum flow must saturate the edges linked to the source or the sink. It can only take value 0 or 1 on the other edges, and every partition vertex is associated to exactly three distinct zone vertices. Every zone is associated to exactly $n_z$ partitions. + +A maximum flow can be constructed using, for instance, Dinic's algorithm. This algorithm works by discovering augmenting paths to iteratively increase the flow. During the exploration of the graph to find augmenting paths, we can shuffle the order of enumeration of the neighbours to spread the associations between zones and partitions. + +Once we have such association, we can randomly distribute the $n_z$ edges picked for every zone $z$ to its nodes $v\in z$ such that every such $v$ gets $n_v$ edges. This defines an optimal assignment of partitions to nodes. + + +\subsection{Minimal transfer} + +Assume that there was a previous assignment $(T'_i)_{1\le i\le N}$ corresponding to utilizations $(n'_v)_{v\in V}$. We would like the new computed assignment $(T_i)_{1\le i\le N}$ from some $(n_v)_{v\in V}$ to minimize the number of partitions that need to be transferred. 
We can imagine two different objectives corresponding to different hypotheses: +\begin{equation} + \tag{H3A} + \label{hyp:A} + \text{\emph{Transfers between different zones cost much more than inside a zone.}} +\end{equation} +\begin{equation} + \tag{H3B} + \label{hyp:B} + \text{\emph{Changing zone is not the largest cost when transferring a partition.}} +\end{equation} + +In case $A$, our goal will be to minimize the number of changes of zone in the assignment of partitions to zones. More formally, we will maximize the quantity +$$ +Q_Z := +\sum_{1\le i\le N} +\#\{z\in Z ~|~ z\cap T_i \neq \emptyset, z\cap T'_i \neq \emptyset \} +.$$ + +In case $B$, our goal will be to minimize the number of changes of nodes in the assignment of partitions to nodes. We will maximize the quantity +$$ +Q_V := +\sum_{1\le i\le N} \#(T_i \cap T'_i). +$$ + +It is tempting to hope that there is a way to maximize both quantities, that having the least discrepancy in terms of nodes will lead to the least discrepancy in terms of zones. But this is actually wrong! We propose the following counter-example to convince the reader: + +We consider eight nodes $a, a', b, c, d, d', e, e'$ belonging to five different zones $\{a,a'\}, \{b\}, \{c\}, \{d,d'\}, \{e, e'\}$. We take three partitions ($N=3$), that are originally assigned with some utilization $(n'_v)_{v\in V}$ as follows: +$$ +T'_1=(a,b,c) \qquad +T'_2=(a',b,d) \qquad +T'_3=(b,c,e). +$$ +This assignment, with updated utilizations $(n_v)_{v\in V}$ minimizes the number of zone changes: +$$ +T_1=(d,b,c) \qquad +T_2=(a,b,d) \qquad +T_3=(b,c,e'). +$$ +This one, with the same utilization, minimizes the number of node changes: +$$ +T_1=(a,b,c) \qquad +T_2=(e',b,d) \qquad +T_3=(b,c,d'). +$$ +One can check that in this case, it is impossible to minimize both the number of zone and node changes. 
+ +Because of the redundancy constraint, we cannot use a greedy algorithm to just replace nodes in the triplets to try to get the new utilization rate: this could lead to a blocking situation where there is still a hole to fill in a triplet but no available node satisfies the zone separation constraint. To circumvent this issue, we propose an algorithm based on finding cycles in a graph encoding of the assignment. As in section \ref{sec:opt_assign}, we can explore the neighbours in a random order in the graph algorithms, to spread the triplets distribution. + + +\subsubsection{Minimizing the zone discrepancy} + + +First, notice that, given an assignment of partitions to \emph{zones}, it is easy to deduce an assignment to \emph{nodes} that minimizes the number of transfers for this zone assignment: For every zone $z$ and every node $v\in z$, pick in any way a set $P_v$ of partitions that were assigned to $v$ in $T'$, to $z_v$ in $T$, with the cardinality of $P_v$ as large as possible but at most $n_v$. Once all these sets are chosen, complement the assignment to reach the right utilization for every node. If $\#P_v < n_v$, it means that all the partitions that could stay in $v$ (i.e. that were already in $v$ and are still assigned to its zone) do stay in $v$. If $\#P_v = n_v$, then $n_v$ partitions stay in $v$, which is the number of partitions that need to be in $v$ in the end. In both cases, we could not hope for better given the partition to zone assignment. + +Our goal now is to find an assignment of partitions to zones that minimizes the number of zone transfers. To do so we are going to represent an assignment as a graph. + +Let $G_T=(X,E_T)$ be the directed weighted graph with vertices $(\mathbf{x}_i)_{1\le i\le N}$ and $(\mathbf{y}_z)_{z\in Z}$. 
For any $1\le i\le N$ and $z\in Z$, $E_T$ contains the arc: +\begin{itemize} + \item $(\mathbf{x}_i, \mathbf{y}_z, +1)$, if $z$ appears in $T_i'$ and $T_i$; + \item $(\mathbf{x}_i, \mathbf{y}_z, -1)$, if $z$ appears in $T_i$ but not in $T'_i$; + \item $(\mathbf{y}_z, \mathbf{x}_i, -1)$, if $z$ appears in $T'_i$ but not in $T_i$; + \item $(\mathbf{y}_z, \mathbf{x}_i, +1)$, if $z$ does not appear in $T'_i$ nor in $T_i$. +\end{itemize} +In other words, the orientation of the arc encodes whether partition $i$ is stored in zone $z$ in the assignment $T$ and the weight $\pm 1$ encodes whether this corresponds to what happens in the assignment $T'$. + +\begin{figure}[t] + \centering + \begin{minipage}{.40\linewidth} + \centering + \includegraphics[width=.8\linewidth]{figures/mini_zone} + \end{minipage} + \begin{minipage}{.55\linewidth} + \centering + \includegraphics[width=.8\linewidth]{figures/mini_node} + \end{minipage} + \caption{On the left: the graph $G_T$ encoding an assignment to minimize the zone discrepancy. On the right: the graph $G_T$ encoding an assignment to minimize the node discrepancy.} +\end{figure} + + +Notice that at every partition, there are three outgoing arcs, and at every zone, there are $n_z$ incoming arcs. Moreover, if $w(e)$ is the weight of an arc $e$, define the weight of $G_T$ by +\begin{align*} +w(G_T) := \sum_{e\in E_T} w(e) &= \#Z \times N - 4 \sum_{1\le i\le N} \#\{z\in Z ~|~ z\cap T_i = \emptyset, z\cap T'_i \neq \emptyset\} \\ +&=\#Z \times N - 4 \sum_{1\le i\le N} \big(3- \#\{z\in Z ~|~ z\cap T_i \neq \emptyset, z\cap T'_i \neq \emptyset\}\big) \\ +&= (\#Z-12)N + 4 Q_Z. +\end{align*} +Hence maximizing $Q_Z$ is equivalent to maximizing $w(G_T)$. + +Assume that there exists some assignment $T^*$ with the same utilization $(n_v)_{v\in V}$. Define $G_{T^*}$ similarly and consider the set $E_\mathrm{Diff} = E_T \setminus E_{T^*}$ of arcs that appear only in $G_T$. 
Since all vertices have the same number of incoming arcs in $G_T$ and $G_{T^*}$, the vertices of the graph $(X, E_\mathrm{Diff})$ must all have the same number of incoming and outgoing arcs. So $E_\mathrm{Diff}$ can be expressed as a union of disjoint cycles. Moreover, the edges of $E_\mathrm{Diff}$ must appear in $E_{T^*}$ with reversed orientation and opposite weight. Hence, we have +$$ + w(G_T) - w(G_{T^*}) = 2 \sum_{e\in E_\mathrm{Diff}} w(e). +$$ +Hence, if $T$ is not optimal, there exists some $T^*$ with $w(G_T) < w(G_{T^*})$, and by the considerations above, there must exist a cycle in $E_\mathrm{Diff}$, and hence in $G_T$, with negative weight. If we reverse the edges and weights along this cycle, we obtain some graph. Since we did not change the incoming degree of any vertex, this is the graph encoding of some valid assignment $T^+$ such that $w(G_{T^+}) > w(G_T)$. We can iterate this operation until there is no other assignment $T^*$ with larger weight, that is until we obtain an optimal assignment. + + + +\subsubsection{Minimizing the node discrepancy} + +We will follow an approach similar to the one where we minimize the zone discrepancy. Here we will directly obtain a node assignment from a graph encoding. + +Let $G_T=(X,E_T)$ be the directed weighted graph with vertices $(\mathbf{x}_i)_{1\le i\le N}$, $(\mathbf{y}_{z,i})_{z\in Z, 1\le i\le N}$ and $(\mathbf{u}_v)_{v\in V}$. For any $1\le i\le N$ and $z\in Z$, $E_T$ contains the arc: +\begin{itemize} + \item $(\mathbf{x}_i, \mathbf{y}_{z,i}, 0)$, if $z$ appears in $T_i$; + \item $(\mathbf{y}_{z,i}, \mathbf{x}_i, 0)$, if $z$ does not appear in $T_i$. 
+\end{itemize} +For any $1\le i\le N$ and $v\in V$, $E_T$ contains the arc: +\begin{itemize} + \item $(\mathbf{y}_{z_v,i}, \mathbf{u}_v, +1)$, if $v$ appears in $T_i'$ and $T_i$; + \item $(\mathbf{y}_{z_v,i}, \mathbf{u}_v, -1)$, if $v$ appears in $T_i$ but not in $T'_i$; + \item $(\mathbf{u}_v, \mathbf{y}_{z_v,i}, -1)$, if $v$ appears in $T'_i$ but not in $T_i$; + \item $(\mathbf{u}_v, \mathbf{y}_{z_v,i}, +1)$, if $v$ does not appear in $T'_i$ nor in $T_i$. +\end{itemize} +Every vertex $\mathbf{x}_i$ has outgoing degree 3, every vertex $\mathbf{y}_{z,i}$ has outgoing degree 1, and every vertex $\mathbf{u}_v$ has incoming degree $n_v$. +Remark that any graph respecting these degree constraints is the encoding of a valid assignment with utilizations $(n_v)_{v\in V}$, in particular no partition is stored in two nodes of the same zone. + +We define $w(G_T)$ similarly: +\begin{align*} + w(G_T) := \sum_{e\in E_T} w(e) &= \#V \times N - 4\sum_{1\le i\le N} \big(3-\#(T_i\cap T'_i)\big) \\ + &= (\#V-12)N + 4Q_V. +\end{align*} + +Exactly like in the previous section, the existence of an assignment with larger weight implies the existence of a negatively weighted cycle in $G_T$. Reversing this cycle gives us the encoding of a valid assignment with a larger weight. Iterating this operation yields an optimal assignment. + + +\subsubsection{Linear combination of both criteria} + +In the graph $G_T$ defined in the previous section, instead of having weights $0$ and $\pm 1$, we could be having weights $\pm\alpha$ between $\mathbf{x}$ and $\mathbf{y}$ vertices, and weights $\pm\beta$ between $\mathbf{y}$ and $\mathbf{u}$ vertices, for some $\alpha,\beta>0$ (we have positive weight if the assignment corresponds to $T'$ and negative otherwise). Then +\begin{align*} + w(G_T) &= \sum_{e\in E_T} w(e) = + \alpha \big( (\#Z-12)N + 4 Q_Z\big) + + \beta \big( (\#V-12)N + 4 Q_V\big) \\ + &= \mathrm{const}+ 4(\alpha Q_Z + \beta Q_V). 
+\end{align*} +So maximizing the weight of such graph encoding would be equivalent to maximizing a linear combination of $Q_Z$ and $Q_V$. + + +\subsection{Algorithm} +We give a high level description of the algorithm to compute an optimal 3-strict assignment. The operations appearing at lines 2, 3 and 5 are respectively described by Algorithms \ref{alg:util},\ref{alg:opt} and \ref{alg:mini}. + + + +\begin{algorithm}[H] + \caption{Optimal 3-strict assignment} + \label{alg:total} + \begin{algorithmic}[1] + \Function{Optimal 3-strict assignment}{$N$, $(c_v)_{v\in V}$, $T'$} + \State $(n_v)_{v\in V} \leftarrow$ \Call{Compute optimal utilization}{$N$, $(c_v)_{v\in V}$} + \State $(T_i)_{1\le i\le N} \leftarrow$ \Call{Compute candidate assignment}{$N$, $(n_v)_{v\in V}$} + \If {there was a previous assignment $T'$} + \State $T \leftarrow$ \Call{Minimization of transfers}{$(T_i)_{1\le i\le N}$, $(T'_i)_{1\le i\le N}$} + \EndIf + \State \Return $T$. + \EndFunction + \end{algorithmic} +\end{algorithm} + +We give some considerations of worst case complexity for these algorithms. In the following, we assume $N>\#V>\#Z$. The complexity of Algorithm \ref{alg:total} is $O(N^3\# Z)$ if we assume \eqref{hyp:A} and $O(N^3 \#Z \#V)$ if we assume \eqref{hyp:B}. + +Algorithm \ref{alg:util} can be implemented with complexity $O(\#V^2)$. The complexity of the function call at line \ref{lin:subutil} is $O(\#V)$. The difference between the sum of the subutilizations and $3N$ is at most the sum of the rounding errors when computing the $\hat{n}_v$. Hence it is bounded by $\#V$ and the loop at line \ref{lin:loopsub} is iterated at most $\#V$ times. Finding the maximizing $v$ at line \ref{lin:findmin} takes $O(\#V)$ operations (naively, we could also use a heap). + +Algorithm \ref{alg:opt} can be implemented with complexity $O(N^3\times \#Z)$. The flow graph has $O(N+\#Z)$ vertices and $O(N\times \#Z)$ edges. 
Dinic's algorithm has complexity $O(\#\mathrm{Vertices}^2\#\mathrm{Edges})$ hence in our case it is $O(N^3\times \#Z)$. + +Algorithm \ref{alg:mini} can be implemented with complexity $O(N^3\# Z)$ under \eqref{hyp:A} and $O(N^3 \#Z \#V)$ under \eqref{hyp:B}. +The graph $G_T$ has $O(N)$ vertices and $O(N\times \#Z)$ edges under assumption \eqref{hyp:A} and respectively $O(N\times \#Z)$ vertices and $O(N\times \#V)$ edges under assumption \eqref{hyp:B}. The loop at line \ref{lin:repeat} is iterated at most $N$ times since the distance between $T$ and $T'$ decreases at every iteration. The Bellman-Ford algorithm has complexity $O(\#\mathrm{Vertices}\#\mathrm{Edges})$, which in our case amounts to $O(N^2\# Z)$ under \eqref{hyp:A} and $O(N^2 \#Z \#V)$ under \eqref{hyp:B}. + +\begin{algorithm} + \caption{Computation of the optimal utilization} + \label{alg:util} + \begin{algorithmic}[1] +\Function{Compute optimal utilization}{$N$, $(c_v)_{v\in V}$} + \State $(\hat{n}_v)_{v\in V} \leftarrow $ \Call{Compute subutilization}{$N$, $(c_v)_{v\in V}$} \label{lin:subutil} + \While{$\sum_{v\in V} \hat{n}_v < 3N$} \label{lin:loopsub} + \State Pick $v\in V$ maximizing $\frac{c_v}{\hat{n}_v+1}$ and such that + $\sum_{v'\in z_v} \hat{n}_{v'} < N$ \label{lin:findmin} + \State $\hat{n}_v \leftarrow \hat{n}_v+1$ + \EndWhile + \State \Return $(\hat{n}_v)_{v\in V}$ +\EndFunction +\State + +\Function{Compute subutilization}{$N$, $(c_v)_{v\in V}$} + \State $R \leftarrow 3$ +\For{$v\in V$} +\State $\hat{n}_v \leftarrow \mathrm{unset}$ +\EndFor +\For{$z\in Z$} +\State $c_z \leftarrow \sum_{v\in z} c_v$ +\EndFor +\State $C \leftarrow \sum_{z\in Z} c_z$ +\While{$\exists z \in Z$ such that $R\times c_{z} > C$} +\For{$v\in z$} +\State $\hat{n}_v \leftarrow \left\lfloor \frac{c_v}{c_z} N \right\rfloor$ +\EndFor +\State $C \leftarrow C-c_z$ +\State $R\leftarrow R-1$ +\EndWhile +\For{$v\in V$} +\If{$\hat{n}_v = \mathrm{unset}$} +\State $\hat{n}_v \leftarrow \left\lfloor \frac{Rc_v}{C} N \right\rfloor$ 
+\EndIf +\EndFor +\State \Return $(\hat{n}_v)_{v\in V}$ +\EndFunction + \end{algorithmic} +\end{algorithm} + +\begin{algorithm} + \caption{Computation of a candidate assignment} + \label{alg:opt} + \begin{algorithmic}[1] + \Function{Compute candidate assignment}{$N$, $(n_v)_{v\in V}$} + \State Compute the flow graph $G$ + \State Compute the maximal flow $f$ using Dinic's algorithm with randomized neighbours enumeration + \State Construct the assignment $(T_i)_{1\le i\le N}$ from $f$ + \State \Return $(T_i)_{1\le i\le N}$ + \EndFunction + \end{algorithmic} +\end{algorithm} + + +\begin{algorithm} + \caption{Minimization of the number of transfers} + \label{alg:mini} + \begin{algorithmic}[1] + \Function{Minimization of transfers}{$(T_i)_{1\le i\le N}$, $(T'_i)_{1\le i\le N}$} + \State Construct the graph encoding $G_T$ + \Repeat \label{lin:repeat} + \State Find a negative cycle $\gamma$ using Bellman-Ford algorithm on $G_T$ + \State Reverse the orientations and weights of edges in $\gamma$ + \Until{no negative cycle is found} + \State Update $(T_i)_{1\le i\le N}$ from $G_T$ + \State \Return $(T_i)_{1\le i\le N}$ + \EndFunction + \end{algorithmic} +\end{algorithm} + + + +\section{TODO} + +- reunion deux fleurs : autres modes, autres contraintes + +\end{document} + -- cgit v1.2.3 From 81083dd415664d9c2d35e52eba13826b952c38e6 Mon Sep 17 00:00:00 2001 From: Mendes Date: Fri, 19 Aug 2022 21:21:41 +0200 Subject: Added a first draft version of the algorithm and analysis for the non-strict mode. 
--- doc/optimal_layout_report/optimal_layout.aux | 15 ++-- doc/optimal_layout_report/optimal_layout.log | 77 ++++++++++----------- doc/optimal_layout_report/optimal_layout.pdf | Bin 279062 -> 289460 bytes .../optimal_layout.synctex.gz | Bin 84542 -> 107678 bytes doc/optimal_layout_report/optimal_layout.tex | 77 ++++++++++++++++++++- 5 files changed, 120 insertions(+), 49 deletions(-) diff --git a/doc/optimal_layout_report/optimal_layout.aux b/doc/optimal_layout_report/optimal_layout.aux index fe8b0891..9e779514 100644 --- a/doc/optimal_layout_report/optimal_layout.aux +++ b/doc/optimal_layout_report/optimal_layout.aux @@ -7,26 +7,29 @@ \@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Optimal assignment}{2}{}\protected@file@percent } \newlabel{sec:opt_assign}{{2.1}{2}} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces On the left, the creation of a concrete assignment with the naive approach of repeating tokens. On the right, the zones containing the nodes.}}{4}{}\protected@file@percent } -\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Flow problem to compute and optimal assignment.}}{4}{}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Minimal transfer}{5}{}\protected@file@percent } \newlabel{hyp:A}{{{H3A}}{5}} \newlabel{hyp:B}{{{H3B}}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Flow problem to compute and optimal assignment.}}{5}{}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {A)}Minimizing the zone discrepancy}{6}{}\protected@file@percent } \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces On the left: the graph $G_T$ encoding an assignment to minimize the zone discrepancy. 
On the right: the graph $G_T$ encoding an assignment to minimize the node discrepancy.}}{7}{}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {B)}Minimizing the node discrepancy}{8}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {C)}Linear combination of both criteria}{8}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsubsection}{\numberline {C)}Linear combination of both criteria}{9}{}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Algorithm}{9}{}\protected@file@percent } \@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Optimal 3-strict assignment}}{9}{}\protected@file@percent } \newlabel{alg:total}{{1}{9}} -\@writefile{toc}{\contentsline {section}{\numberline {3}TODO}{9}{}\protected@file@percent } \@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces Computation of the optimal utilization}}{10}{}\protected@file@percent } \newlabel{alg:util}{{2}{10}} \newlabel{lin:subutil}{{2}{10}} \newlabel{lin:loopsub}{{3}{10}} \newlabel{lin:findmin}{{4}{10}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Computation of a candidate assignment}}{10}{}\protected@file@percent } -\newlabel{alg:opt}{{3}{10}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Computation of a candidate assignment}}{11}{}\protected@file@percent } +\newlabel{alg:opt}{{3}{11}} \@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Minimization of the number of transfers}}{11}{}\protected@file@percent } \newlabel{alg:mini}{{4}{11}} \newlabel{lin:repeat}{{3}{11}} -\gdef \@abspage@last{11} +\@writefile{toc}{\contentsline {section}{\numberline {3}Computation of a 3-non-strict assignment}{11}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Choices of optimality}{11}{}\protected@file@percent } +\@writefile{toc}{\contentsline 
{subsection}{\numberline {3.2}Computation of a candidate assignment}{11}{}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Maximal spread and minimal transfers}{12}{}\protected@file@percent } +\gdef \@abspage@last{13} diff --git a/doc/optimal_layout_report/optimal_layout.log b/doc/optimal_layout_report/optimal_layout.log index c73818ff..1bce9627 100644 --- a/doc/optimal_layout_report/optimal_layout.log +++ b/doc/optimal_layout_report/optimal_layout.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020/Debian) (preloaded format=pdflatex 2022.6.23) 18 JUL 2022 22:33 +This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020/Debian) (preloaded format=pdflatex 2022.6.23) 19 AUG 2022 21:20 entering extended mode restricted \write18 enabled. %&-line parsing enabled. @@ -228,35 +228,30 @@ LaTeX Font Info: Trying to load font information for U+msb on input line 17. File: umsb.fd 2013/01/14 v3.01 AMS symbols B ) [1 -{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] [2] - +{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] [2] [3] + File: figures/naive.pdf Graphic file (type pdf) -Package pdftex.def Info: figures/naive.pdf used on input line 117. +Package pdftex.def Info: figures/naive.pdf used on input line 121. (pdftex.def) Requested size: 310.4979pt x 116.6252pt. - [3] File: figures/flow.pdf Graphic file (type pdf) -Package pdftex.def Info: figures/flow.pdf used on input line 136. +Package pdftex.def Info: figures/flow.pdf used on input line 140. (pdftex.def) Requested size: 207.0021pt x 104.94873pt. - [4 <./figures/naive.pdf> <./figures/flow.pdf - -pdfTeX warning: /usr/bin/pdflatex (file ./figures/flow.pdf): PDF inclusion: mul -tiple pdfs with page group included in a single page ->] [5] - + [4 <./figures/naive.pdf>] [5 <./figures/flow.pdf>] [6] + File: figures/mini_zone.pdf Graphic file (type pdf) -Package pdftex.def Info: figures/mini_zone.pdf used on input line 221. 
+Package pdftex.def Info: figures/mini_zone.pdf used on input line 225. (pdftex.def) Requested size: 110.39873pt x 138.8974pt. - + File: figures/mini_node.pdf Graphic file (type pdf) -Package pdftex.def Info: figures/mini_node.pdf used on input line 225. +Package pdftex.def Info: figures/mini_node.pdf used on input line 229. (pdftex.def) Requested size: 151.8014pt x 157.28752pt. - [6] -Overfull \hbox (6.52959pt too wide) in paragraph at lines 239--240 + +Overfull \hbox (6.52959pt too wide) in paragraph at lines 243--244 []\OT1/cmr/m/n/10 Assume that their ex-ist some as-sign-ment $\OML/cmm/m/it/10 T[]$ \OT1/cmr/m/n/10 with the same uti-liza-tion $(\OML/cmm/m/it/10 n[]\OT1/cmr /m/n/10 )[]$. @@ -266,38 +261,38 @@ T[]$ \OT1/cmr/m/n/10 with the same uti-liza-tion $(\OML/cmm/m/it/10 n[]\OT1/cmr pdfTeX warning: /usr/bin/pdflatex (file ./figures/mini_node.pdf): PDF inclusion : multiple pdfs with page group included in a single page ->] [8] [9] [10] [11] (./optimal_layout.aux) ) +>] [8] [9] [10] [11] [12] [13] (./optimal_layout.aux) ) Here is how much of TeX's memory you used: 3544 strings out of 481176 47263 string characters out of 5914226 - 339215 words of memory out of 5000000 + 336215 words of memory out of 5000000 20458 multiletter control sequences out of 15000+600000 413592 words of font info for 65 fonts, out of 8000000 for 9000 59 hyphenation exceptions out of 8191 68i,12n,74p,880b,308s stack positions out of 5000i,500n,10000p,200000b,80000s - -Output written on optimal_layout.pdf (11 pages, 279062 bytes). + < +/usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr17.pfb> +Output written on optimal_layout.pdf (13 pages, 289460 bytes). PDF statistics: - 127 PDF objects out of 1000 (max. 8388607) - 90 compressed objects within 1 object stream + 135 PDF objects out of 1000 (max. 8388607) + 96 compressed objects within 1 object stream 0 named destinations out of 1000 (max. 500000) 21 words of extra memory for PDF output out of 10000 (max. 
10000000) diff --git a/doc/optimal_layout_report/optimal_layout.pdf b/doc/optimal_layout_report/optimal_layout.pdf index 84265135..667798fe 100644 Binary files a/doc/optimal_layout_report/optimal_layout.pdf and b/doc/optimal_layout_report/optimal_layout.pdf differ diff --git a/doc/optimal_layout_report/optimal_layout.synctex.gz b/doc/optimal_layout_report/optimal_layout.synctex.gz index 376399c7..59241b07 100644 Binary files a/doc/optimal_layout_report/optimal_layout.synctex.gz and b/doc/optimal_layout_report/optimal_layout.synctex.gz differ diff --git a/doc/optimal_layout_report/optimal_layout.tex b/doc/optimal_layout_report/optimal_layout.tex index 843e0be6..594c7ecc 100644 --- a/doc/optimal_layout_report/optimal_layout.tex +++ b/doc/optimal_layout_report/optimal_layout.tex @@ -54,6 +54,10 @@ For now, in the following, we ask the following redundancy constraint: \textbf{Mode 3-strict:} every partition needs to be assignated to three nodes belonging to three different zones. +\textbf{Mode 3:} every partition needs to be assignated to three nodes. We try to spread the three nodes over different zones as much as possible. + +\textbf{Remark: (TODO):} The algorithms below directly adapt to a redundancy of $r$ instead of 3. + \section{Properties of an optimal 3-strict assignment} \subsection{Optimal assignment} @@ -384,11 +388,80 @@ The graph $G_T$ has $O(N)$ vertices and $O(N\times \#Z)$ edges under assumption \end{algorithmic} \end{algorithm} +\newpage + +\section{Computation of a 3-non-strict assignment} + +\subsection{Choices of optimality} + +In this mode, we primarily want to store every partition on three nodes, and only secondarily try to spread the nodes among different zone. So we make the choice of not taking the zone repartition in the criterion of optimality. + +We try to maximize $s^*$ defined in \eqref{eq:optimal}. So we can compute the optimal utilizations $(n_v)_{v\in V}$ with the only constraint that $n_v \le N$ for every node $v$. 
As in the previous section, we start with a sub-utilization proportional to $c_v$ (and capped at $N$), and we iteratively increase the $\hat{n}_v$ that is less than $N$ and maximizes the quantity $c_v/(\hat{n}_v+1)$, until the total sum is $3N$. + +\subsection{Computation of a candidate assignment} + +To compute a candidate assignment (that does not optimize zone spreading nor distance to a previous assignment yet), we can use the following flow problem. + +Define the oriented weighted graph $(X,E)$. The set of vertices $X$ contains the source $\mathbf{s}$, the sink $\mathbf{t}$, vertices +$\mathbf{x}_p, \mathbf{u}^+_p, \mathbf{u}^-_p$ for every partition $p$, vertices $\mathbf{y}_{p,z}$ for every partition $p$ and zone $z$, and vertices $\mathbf{z}_v$ for every node $v$. + +The set of edges is composed of the following arcs: +\begin{itemize} + \item ($\mathbf{s}$,$\mathbf{x}_p$, 3) for every partition $p$; + \item ($\mathbf{x}_p$,$\mathbf{u}^+_p$, 3) for every partition $p$; + \item ($\mathbf{x}_p$,$\mathbf{u}^-_p$, 2) for every partition $p$; + \item ($\mathbf{u}^+_p$,$\mathbf{y}_{p,z}$, 1) for every partition $p$ and zone $z$; + \item ($\mathbf{u}^-_p$,$\mathbf{y}_{p,z}$, 2) for every partition $p$ and zone $z$; + \item ($\mathbf{y}_{p,z}$,$\mathbf{z}_v$, 1) for every partition $p$, zone $z$ and node $v\in z$; + \item ($\mathbf{z}_v$, $\mathbf{t}$, $n_v$) for every node $v$; +\end{itemize} + +One can check that any maximal flow in this graph corresponds to an assignment of partitions to nodes. In such a flow, all the arcs from $\mathbf{s}$ and to $\mathbf{t}$ are saturated. The arc from $\mathbf{y}_{p,z}$ to $\mathbf{z}_v$ is saturated if and only if $p$ is associated to~$v$. +Finally the flow from $\mathbf{x}_p$ to $\mathbf{y}_{p,z}$ can go either through $\mathbf{u}^+_p$ or $\mathbf{u}^-_p$. 
-\section{TODO} -- reunion deux fleurs : autres modes, autres contraintes +\subsection{Maximal spread and minimal transfers} +Notice that if the arc $\mathbf{u}_p^+\mathbf{y}_{p,z}$ is not saturated but there is some flow in $\mathbf{u}_p^-\mathbf{y}_{p,z}$, then it is possible to transfer a unit of flow from the path $\mathbf{x}_p\mathbf{u}_p^-\mathbf{y}_{p,z}$ to the path $\mathbf{x}_p\mathbf{u}_p^+\mathbf{y}_{p,z}$. So we can always find an equivalent maximal flow $f^*$ that uses the path through $\mathbf{u}_p^-$ only if the path through $\mathbf{u}_p^+$ is saturated. + +We will use this fact to consider the amount of flow going through the vertices $\mathbf{u}^+$ as a measure of how well the partitions are spread over nodes belonging to different zones. If the partition $p$ is associated to 3 different zones, then a flow of 3 will cross $\mathbf{u}_p^+$ in $f^*$ (i.e. a flow of 0 will cross $\mathbf{u}_p^-$). If $p$ is associated to two zones, a flow of $2$ will cross $\mathbf{u}_p^+$. If $p$ is associated to a single zone, a flow of $1$ will cross $\mathbf{u}_p^+$. + +Let $N_1, N_2, N_3$ be the number of partitions associated to respectively 1,2 and 3 distinct zones. We will optimize a linear combination of these variables using the discovery of positively weighted circuits in a graph. + +At the same step, we will also optimize the distance to a previous assignment $T'$. Let $\alpha> \beta> \gamma \ge 0$ be three parameters. + +Given the flow $f$, let $G_f=(X',E_f)$ be the multi-graph where $X' = X\setminus\{\mathbf{s},\mathbf{t}\}$. 
The set $E_f$ is composed of the arcs: +\begin{itemize} +\item As many arcs from $(\mathbf{x}_p, \mathbf{u}^+_p,\alpha), (\mathbf{x}_p, \mathbf{u}^+_p,\beta), (\mathbf{x}_p, \mathbf{u}^+_p,\gamma)$ (selected in this order) as there is flow crossing $\mathbf{u}^+_p$ in $f$; +\item As many arcs from $(\mathbf{u}^+_p, \mathbf{x}_p,-\gamma), (\mathbf{u}^+_p, \mathbf{x}_p,-\beta), (\mathbf{u}^+_p, \mathbf{x}_p,-\alpha)$ (selected in this order) as there is flow crossing $\mathbf{u}^-_p$ in $f$; +\item As many copies of $(\mathbf{x}_p, \mathbf{u}^-_p,0)$ as there is flow through $\mathbf{u}^-_p$; +\item As many copies of $(\mathbf{u}^-_p,\mathbf{x}_p,0)$ so that the number of arcs between these two vertices is 2; +\item $(\mathbf{u}^+_p,\mathbf{y}_{p,z}, 0)$ if the flow between these vertices is 1, and the opposite arc otherwise; +\item as many copies of $(\mathbf{u}^-_p,\mathbf{y}_{p,z}, 0)$ as the flow between these vertices, and as many copies of the opposite arc as 2~$-$~the flow; +\item $(\mathbf{y}_{p,z},\mathbf{z}_v, \pm1)$ if it is saturated in $f$, with $+1$ if $v\in T'_p$ and $-1$ otherwise; +\item $(\mathbf{z}_v,\mathbf{y}_{p,z}, \pm1)$ if it is not saturated in $f$, with $+1$ if $v\notin T'_p$ and $-1$ otherwise. +\end{itemize} +To summarize, arcs are oriented left to right if they correspond to a presence of flow in $f$, and right to left if they correspond to an absence of flow. They are positively weighted if we want them to stay at their current state, and negatively if we want them to switch. Let us compute the weight of such graph. 
+ +\begin{multline*} + w(G_f) = \sum_{e\in E_f} w(e) \\ + = + (\alpha - \beta -\gamma) N_1 + (\alpha +\beta - \gamma) N_2 + (\alpha+\beta+\gamma) N_3 + \\ + + \#V\times N - 4 \sum_p \big(3-\#(T_p\cap T'_p)\big) \\ + =(\#V-12+\alpha-\beta-\gamma)\times N + 4Q_V + 2\beta N_2 + 2(\beta+\gamma) N_3 \\ +\end{multline*} + +As for the mode 3-strict, one can check that the difference of two such graphs corresponding to the same $(n_v)$ is always Eulerian. Hence we can navigate in this class with the same greedy algorithm that discovers positive cycles and flips them. + +The function that we optimize is +$$ +2Q_V + \beta N_2 + (\beta+\gamma) N_3. +$$ +The choice of parameters $\beta$ and $\gamma$ should be guided by the following questions: For $\beta$, where to put the tradeoff between zone dispersion and distance to the previous configuration? For $\gamma$, do we prefer to have more partitions spread between 2 zones, or to have fewer partitions spread between at least 2 zones but more spread between 3 zones? + +The quantity $Q_V$ varies between $0$ and $3N$, it should be of order $N$. The quantity $N_2+N_3$ should also be of order $N$ (it is exactly $N$ in the strict mode). So the two terms of the function are comparable. 
+ \end{document} -- cgit v1.2.3 From d38fb6c2500c20cc6fabf3192fa7c136675788c5 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 8 Sep 2022 12:43:33 +0200 Subject: ignore log files in commit --- doc/optimal_layout_report/.gitignore | 4 + doc/optimal_layout_report/optimal_layout.aux | 35 --- doc/optimal_layout_report/optimal_layout.log | 298 --------------------- .../optimal_layout.synctex.gz | Bin 107678 -> 0 bytes doc/optimal_layout_report/optimal_layout.tex | 7 + 5 files changed, 11 insertions(+), 333 deletions(-) create mode 100644 doc/optimal_layout_report/.gitignore delete mode 100644 doc/optimal_layout_report/optimal_layout.aux delete mode 100644 doc/optimal_layout_report/optimal_layout.log delete mode 100644 doc/optimal_layout_report/optimal_layout.synctex.gz diff --git a/doc/optimal_layout_report/.gitignore b/doc/optimal_layout_report/.gitignore new file mode 100644 index 00000000..3bd5cbf6 --- /dev/null +++ b/doc/optimal_layout_report/.gitignore @@ -0,0 +1,4 @@ +optimal_layout.aux +optimal_layout.log +optimal_layout.synctex.gz + diff --git a/doc/optimal_layout_report/optimal_layout.aux b/doc/optimal_layout_report/optimal_layout.aux deleted file mode 100644 index 9e779514..00000000 --- a/doc/optimal_layout_report/optimal_layout.aux +++ /dev/null @@ -1,35 +0,0 @@ -\relax -\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}Context}{1}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}Formal description of the problem}{1}{}\protected@file@percent } -\newlabel{eq:optimal}{{{OPT}}{1}} -\@writefile{toc}{\contentsline {section}{\numberline {2}Properties of an optimal 3-strict assignment}{2}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Optimal assignment}{2}{}\protected@file@percent } -\newlabel{sec:opt_assign}{{2.1}{2}} -\@writefile{lof}{\contentsline {figure}{\numberline 
{1}{\ignorespaces On the left, the creation of a concrete assignment with the naive approach of repeating tokens. On the right, the zones containing the nodes.}}{4}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Minimal transfer}{5}{}\protected@file@percent } -\newlabel{hyp:A}{{{H3A}}{5}} -\newlabel{hyp:B}{{{H3B}}{5}} -\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Flow problem to compute and optimal assignment.}}{5}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {A)}Minimizing the zone discrepancy}{6}{}\protected@file@percent } -\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces On the left: the graph $G_T$ encoding an assignment to minimize the zone discrepancy. On the right: the graph $G_T$ encoding an assignment to minimize the node discrepancy.}}{7}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {B)}Minimizing the node discrepancy}{8}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsubsection}{\numberline {C)}Linear combination of both criteria}{9}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Algorithm}{9}{}\protected@file@percent } -\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Optimal 3-strict assignment}}{9}{}\protected@file@percent } -\newlabel{alg:total}{{1}{9}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces Computation of the optimal utilization}}{10}{}\protected@file@percent } -\newlabel{alg:util}{{2}{10}} -\newlabel{lin:subutil}{{2}{10}} -\newlabel{lin:loopsub}{{3}{10}} -\newlabel{lin:findmin}{{4}{10}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Computation of a candidate assignment}}{11}{}\protected@file@percent } -\newlabel{alg:opt}{{3}{11}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Minimization of the number of 
transfers}}{11}{}\protected@file@percent } -\newlabel{alg:mini}{{4}{11}} -\newlabel{lin:repeat}{{3}{11}} -\@writefile{toc}{\contentsline {section}{\numberline {3}Computation of a 3-non-strict assignment}{11}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Choices of optimality}{11}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Computation of a candidate assignment}{11}{}\protected@file@percent } -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Maximal spread and minimal transfers}{12}{}\protected@file@percent } -\gdef \@abspage@last{13} diff --git a/doc/optimal_layout_report/optimal_layout.log b/doc/optimal_layout_report/optimal_layout.log deleted file mode 100644 index 1bce9627..00000000 --- a/doc/optimal_layout_report/optimal_layout.log +++ /dev/null @@ -1,298 +0,0 @@ -This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020/Debian) (preloaded format=pdflatex 2022.6.23) 19 AUG 2022 21:20 -entering extended mode - restricted \write18 enabled. - %&-line parsing enabled. -**optimal_layout.tex -(./optimal_layout.tex -LaTeX2e <2020-10-01> patch level 4 -L3 programming layer <2021-01-09> xparse <2020-03-03> -(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls -Document Class: article 2020/04/10 v1.4m Standard LaTeX document class -(/usr/share/texlive/texmf-dist/tex/latex/base/size10.clo -File: size10.clo 2020/04/10 v1.4m Standard LaTeX file (size option) -) -\c@part=\count177 -\c@section=\count178 -\c@subsection=\count179 -\c@subsubsection=\count180 -\c@paragraph=\count181 -\c@subparagraph=\count182 -\c@figure=\count183 -\c@table=\count184 -\abovecaptionskip=\skip47 -\belowcaptionskip=\skip48 -\bibindent=\dimen138 -) -(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsmath.sty -Package: amsmath 2020/09/23 v2.17i AMS math features -\@mathmargin=\skip49 - -For additional information on amsmath, use the `?' option. 
-(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amstext.sty -Package: amstext 2000/06/29 v2.01 AMS text - -(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsgen.sty -File: amsgen.sty 1999/11/30 v2.0 generic functions -\@emptytoks=\toks15 -\ex@=\dimen139 -)) -(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsbsy.sty -Package: amsbsy 1999/11/29 v1.2d Bold Symbols -\pmbraise@=\dimen140 -) -(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsopn.sty -Package: amsopn 2016/03/08 v2.02 operator names -) -\inf@bad=\count185 -LaTeX Info: Redefining \frac on input line 234. -\uproot@=\count186 -\leftroot@=\count187 -LaTeX Info: Redefining \overline on input line 399. -\classnum@=\count188 -\DOTSCASE@=\count189 -LaTeX Info: Redefining \ldots on input line 496. -LaTeX Info: Redefining \dots on input line 499. -LaTeX Info: Redefining \cdots on input line 620. -\Mathstrutbox@=\box47 -\strutbox@=\box48 -\big@size=\dimen141 -LaTeX Font Info: Redeclaring font encoding OML on input line 743. -LaTeX Font Info: Redeclaring font encoding OMS on input line 744. -\macc@depth=\count190 -\c@MaxMatrixCols=\count191 -\dotsspace@=\muskip16 -\c@parentequation=\count192 -\dspbrk@lvl=\count193 -\tag@help=\toks16 -\row@=\count194 -\column@=\count195 -\maxfields@=\count196 -\andhelp@=\toks17 -\eqnshift@=\dimen142 -\alignsep@=\dimen143 -\tagshift@=\dimen144 -\tagwidth@=\dimen145 -\totwidth@=\dimen146 -\lineht@=\dimen147 -\@envbody=\toks18 -\multlinegap=\skip50 -\multlinetaggap=\skip51 -\mathdisplay@stack=\toks19 -LaTeX Info: Redefining \[ on input line 2923. -LaTeX Info: Redefining \] on input line 2924. -) -(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amssymb.sty -Package: amssymb 2013/01/14 v3.01 AMS font symbols - -(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amsfonts.sty -Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support -\symAMSa=\mathgroup4 -\symAMSb=\mathgroup5 -LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. 
-LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' -(Font) U/euf/m/n --> U/euf/b/n on input line 106. -)) -(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphicx.sty -Package: graphicx 2020/09/09 v1.2b Enhanced LaTeX Graphics (DPC,SPQR) - -(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty -Package: keyval 2014/10/28 v1.15 key=value parser (DPC) -\KV@toks@=\toks20 -) -(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphics.sty -Package: graphics 2020/08/30 v1.4c Standard LaTeX Graphics (DPC,SPQR) - -(/usr/share/texlive/texmf-dist/tex/latex/graphics/trig.sty -Package: trig 2016/01/03 v1.10 sin cos tan (DPC) -) -(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/graphics.cfg -File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration -) -Package graphics Info: Driver file: pdftex.def on input line 105. - -(/usr/share/texlive/texmf-dist/tex/latex/graphics-def/pdftex.def -File: pdftex.def 2020/10/05 v1.2a Graphics/color driver for pdftex -)) -\Gin@req@height=\dimen148 -\Gin@req@width=\dimen149 -) -(/usr/share/texlive/texmf-dist/tex/latex/xcolor/xcolor.sty -Package: xcolor 2016/05/11 v2.12 LaTeX color extensions (UK) - -(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/color.cfg -File: color.cfg 2016/01/02 v1.6 sample color configuration -) -Package xcolor Info: Driver file: pdftex.def on input line 225. -Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1348. -Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1352. -Package xcolor Info: Model `RGB' extended on input line 1364. -Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1366. -Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1367. -Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1368. -Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1369. -Package xcolor Info: Model `Gray' substituted by `gray' on input line 1370. 
-Package xcolor Info: Model `wave' substituted by `hsb' on input line 1371. -) -(/usr/share/texlive/texmf-dist/tex/latex/algorithms/algorithm.sty -Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating enviro -nment - -(/usr/share/texlive/texmf-dist/tex/latex/float/float.sty -Package: float 2001/11/08 v1.3d Float enhancements (AL) -\c@float@type=\count197 -\float@exts=\toks21 -\float@box=\box49 -\@float@everytoks=\toks22 -\@floatcapt=\box50 -) -(/usr/share/texlive/texmf-dist/tex/latex/base/ifthen.sty -Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC) -) -\@float@every@algorithm=\toks23 -\c@algorithm=\count198 -) -(/usr/share/texlive/texmf-dist/tex/latex/algorithmicx/algpseudocode.sty -Package: algpseudocode - -(/usr/share/texlive/texmf-dist/tex/latex/algorithmicx/algorithmicx.sty -Package: algorithmicx 2005/04/27 v1.2 Algorithmicx - -Document Style algorithmicx 1.2 - a greatly improved `algorithmic' style -\c@ALG@line=\count199 -\c@ALG@rem=\count266 -\c@ALG@nested=\count267 -\ALG@tlm=\skip52 -\ALG@thistlm=\skip53 -\c@ALG@Lnr=\count268 -\c@ALG@blocknr=\count269 -\c@ALG@storecount=\count270 -\c@ALG@tmpcounter=\count271 -\ALG@tmplength=\skip54 -) -Document Style - pseudocode environments for use with the `algorithmicx' style -) (/usr/share/texlive/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def -File: l3backend-pdftex.def 2020-01-29 L3 backend support: PDF output (pdfTeX) -\l__color_backend_stack_int=\count272 -\l__pdf_internal_box=\box51 -) -(./optimal_layout.aux) -\openout1 = `optimal_layout.aux'. - -LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 15. -LaTeX Font Info: ... okay on input line 15. -LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 15. -LaTeX Font Info: ... okay on input line 15. -LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 15. -LaTeX Font Info: ... okay on input line 15. -LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 15. 
-LaTeX Font Info: ... okay on input line 15. -LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 15. -LaTeX Font Info: ... okay on input line 15. -LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 15. -LaTeX Font Info: ... okay on input line 15. -LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 15. -LaTeX Font Info: ... okay on input line 15. - -(/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii -[Loading MPS to PDF converter (version 2006.09.02).] -\scratchcounter=\count273 -\scratchdimen=\dimen150 -\scratchbox=\box52 -\nofMPsegments=\count274 -\nofMParguments=\count275 -\everyMPshowfont=\toks24 -\MPscratchCnt=\count276 -\MPscratchDim=\dimen151 -\MPnumerator=\count277 -\makeMPintoPDFobject=\count278 -\everyMPtoPDFconversion=\toks25 -) (/usr/share/texlive/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty -Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf -Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4 -85. - -(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg -File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv -e -)) -LaTeX Font Info: Trying to load font information for U+msa on input line 17. - - -(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsa.fd -File: umsa.fd 2013/01/14 v3.01 AMS symbols A -) -LaTeX Font Info: Trying to load font information for U+msb on input line 17. - - -(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsb.fd -File: umsb.fd 2013/01/14 v3.01 AMS symbols B -) [1 - -{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] [2] [3] - -File: figures/naive.pdf Graphic file (type pdf) - -Package pdftex.def Info: figures/naive.pdf used on input line 121. -(pdftex.def) Requested size: 310.4979pt x 116.6252pt. - -File: figures/flow.pdf Graphic file (type pdf) - -Package pdftex.def Info: figures/flow.pdf used on input line 140. -(pdftex.def) Requested size: 207.0021pt x 104.94873pt. 
- [4 <./figures/naive.pdf>] [5 <./figures/flow.pdf>] [6] - -File: figures/mini_zone.pdf Graphic file (type pdf) - -Package pdftex.def Info: figures/mini_zone.pdf used on input line 225. -(pdftex.def) Requested size: 110.39873pt x 138.8974pt. - -File: figures/mini_node.pdf Graphic file (type pdf) - -Package pdftex.def Info: figures/mini_node.pdf used on input line 229. -(pdftex.def) Requested size: 151.8014pt x 157.28752pt. - -Overfull \hbox (6.52959pt too wide) in paragraph at lines 243--244 -[]\OT1/cmr/m/n/10 Assume that their ex-ist some as-sign-ment $\OML/cmm/m/it/10 -T[]$ \OT1/cmr/m/n/10 with the same uti-liza-tion $(\OML/cmm/m/it/10 n[]\OT1/cmr -/m/n/10 )[]$. - [] - -[7 <./figures/mini_zone.pdf> <./figures/mini_node.pdf - -pdfTeX warning: /usr/bin/pdflatex (file ./figures/mini_node.pdf): PDF inclusion -: multiple pdfs with page group included in a single page ->] [8] [9] [10] [11] [12] [13] (./optimal_layout.aux) ) -Here is how much of TeX's memory you used: - 3544 strings out of 481176 - 47263 string characters out of 5914226 - 336215 words of memory out of 5000000 - 20458 multiletter control sequences out of 15000+600000 - 413592 words of font info for 65 fonts, out of 8000000 for 9000 - 59 hyphenation exceptions out of 8191 - 68i,12n,74p,880b,308s stack positions out of 5000i,500n,10000p,200000b,80000s - < -/usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/cmr17.pfb> -Output written on optimal_layout.pdf (13 pages, 289460 bytes). -PDF statistics: - 135 PDF objects out of 1000 (max. 8388607) - 96 compressed objects within 1 object stream - 0 named destinations out of 1000 (max. 500000) - 21 words of extra memory for PDF output out of 10000 (max. 
10000000) - diff --git a/doc/optimal_layout_report/optimal_layout.synctex.gz b/doc/optimal_layout_report/optimal_layout.synctex.gz deleted file mode 100644 index 59241b07..00000000 Binary files a/doc/optimal_layout_report/optimal_layout.synctex.gz and /dev/null differ diff --git a/doc/optimal_layout_report/optimal_layout.tex b/doc/optimal_layout_report/optimal_layout.tex index 594c7ecc..cb0d2479 100644 --- a/doc/optimal_layout_report/optimal_layout.tex +++ b/doc/optimal_layout_report/optimal_layout.tex @@ -462,6 +462,13 @@ The choice of parameters $\beta$ and $\gamma$ should be lead by the following qu The quantity $Q_V$ varies between $0$ and $3N$, it should be of order $N$. The quantity $N_2+N_3$ should also be of order $N$ (it is exactly $N$ in the strict mode). So the two terms of the function are comparable. +\section{TODO} + +Ajouter des affichages, voir https://pad.deuxfleurs.fr/pad/#/2/pad/view/rrKyASaaGKDIX4QICZCMP4f50M+nq5EMCvfvFQOsyXw/ + + \end{document} + + -- cgit v1.2.3 From c4adbeed515c571369453d23c7f1d84b1db994ec Mon Sep 17 00:00:00 2001 From: Mendes Date: Sat, 10 Sep 2022 13:51:12 +0200 Subject: Added the section with description proofs of the parametric assignment computation in the optimal layout report --- doc/optimal_layout_report/.gitignore | 3 +- doc/optimal_layout_report/optimal_layout.bib | 11 ++ doc/optimal_layout_report/optimal_layout.pdf | Bin 289460 -> 395187 bytes doc/optimal_layout_report/optimal_layout.tex | 258 +++++++++++++++++++++++++-- 4 files changed, 260 insertions(+), 12 deletions(-) create mode 100644 doc/optimal_layout_report/optimal_layout.bib diff --git a/doc/optimal_layout_report/.gitignore b/doc/optimal_layout_report/.gitignore index 3bd5cbf6..52deb7ad 100644 --- a/doc/optimal_layout_report/.gitignore +++ b/doc/optimal_layout_report/.gitignore @@ -1,4 +1,5 @@ optimal_layout.aux optimal_layout.log optimal_layout.synctex.gz - +optimal_layout.bbl +optimal_layout.blg diff --git a/doc/optimal_layout_report/optimal_layout.bib 
b/doc/optimal_layout_report/optimal_layout.bib new file mode 100644 index 00000000..9552b11d --- /dev/null +++ b/doc/optimal_layout_report/optimal_layout.bib @@ -0,0 +1,11 @@ + +@article{even1975network, + title={Network flow and testing graph connectivity}, + author={Even, Shimon and Tarjan, R Endre}, + journal={SIAM journal on computing}, + volume={4}, + number={4}, + pages={507--518}, + year={1975}, + publisher={SIAM} +} diff --git a/doc/optimal_layout_report/optimal_layout.pdf b/doc/optimal_layout_report/optimal_layout.pdf index 667798fe..c85803e8 100644 Binary files a/doc/optimal_layout_report/optimal_layout.pdf and b/doc/optimal_layout_report/optimal_layout.pdf differ diff --git a/doc/optimal_layout_report/optimal_layout.tex b/doc/optimal_layout_report/optimal_layout.tex index cb0d2479..b2898adb 100644 --- a/doc/optimal_layout_report/optimal_layout.tex +++ b/doc/optimal_layout_report/optimal_layout.tex @@ -1,6 +1,7 @@ \documentclass[]{article} \usepackage{amsmath,amssymb} +\usepackage{amsthm} \usepackage{graphicx,xcolor} @@ -8,9 +9,11 @@ \renewcommand\thesubsubsection{\Alph{subsubsection})} +\newtheorem{proposition}{Proposition} + %opening \title{Optimal partition assignment in Garage} -\author{Mendes Oulamara} +\author{Mendes} \begin{document} @@ -22,25 +25,25 @@ Garage is an open-source distributed storage service blablabla$\dots$ -Every object to be stored in the system falls in a partition given by the last $k$ bits of its hash. There are $N=2^k$ partitions. Every partition will be stored on distinct nodes of the system. The goal of the assignment of partitions to nodes is to ensure (nodes and zone) redundancy and to be as efficient as possible. +Every object to be stored in the system falls in a partition given by the last $k$ bits of its hash. There are $P=2^k$ partitions. Every partition will be stored on distinct nodes of the system. 
The goal of the assignment of partitions to nodes is to ensure (nodes and zone) redundancy and to be as efficient as possible. \subsection{Formal description of the problem} -We are given a set of nodes $V$ and a set of zones $Z$. Every node $v$ has a non-negative storage capacity $c_v\ge 0$ and belongs to a zone $z_v\in Z$. We are also given a number of partition $N>0$ (typically $N=256$). +We are given a set of nodes $\mathbf{N}$ and a set of zones $\mathbf{Z}$. Every node $n$ has a non-negative storage capacity $c_n\ge 0$ and belongs to a zone $z\in \mathbf{Z}$. We are also given a number of partition $P>0$ (typically $P=256$). -We would like to compute an assignment of three nodes to every partition. That is, for every $1\le i\le N$, we compute a triplet of three distinct nodes $T_i=(T_i^1, T_i^2, T_i^3) \in V^3$. We will impose some redundancy constraints to this assignment, and under these constraints, we want our system to have the largest storage capacity possible. To link storage capacity to partition assignment, we make the following assumption: +We would like to compute an assignment of nodes to partitions. We will impose some redundancy constraints to this assignment, and under these constraints, we want our system to have the largest storage capacity possible. To link storage capacity to partition assignment, we make the following assumption: \begin{equation} \tag{H1} \text{\emph{All partitions have the same size $s$.}} \end{equation} This assumption is justified by the dispersion of the hashing function, when the number of partitions is small relative to the number of stored large objects. -Every node $v$ needs to store $n_v = \#\{ 1\le i\le N ~|~ v\in T_i \}$ partitions (where $\#$ denots the number of indices in the set). Hence the partitions stored by $v$ (and hence all partitions by our assumption) have there size bounded by $c_v/n_v$. 
This remark leads us to define the optimal size that we will want to maximize: +Every node $n$ will store some number $k_n$ of partitions. Hence the partitions stored by $n$ (and hence all partitions by our assumption) have their size bounded by $c_n/k_n$. This remark leads us to define the optimal size that we will want to maximize: \begin{equation} \label{eq:optimal} \tag{OPT} -s^* = \min_{v \in V} \frac{c_v}{n_v}. +s^* = \min_{n \in \mathbf{N}} \frac{c_n}{k_n}. \end{equation} When the capacities of the nodes are updated (this includes adding or removing a node), we want to update the assignment as well. However, transferring the data between nodes has a cost and we would like to limit the number of changes in the assignment. We make the following assumption: @@ -52,11 +55,246 @@ This assumption justifies that when we compute the new assignment, it is worth t For now, in the following, we ask the following redundancy constraint: +\textbf{Parametric node and zone redundancy:} Given two integer parameters $1\le \rho_\mathbf{Z} \le \rho_\mathbf{N}$, we ask every partition to be stored on $\rho_\mathbf{N}$ distinct nodes, and these nodes must belong to at least $\rho_\mathbf{Z}$ distinct zones. + + \textbf{Mode 3-strict:} every partition needs to be assignated to three nodes belonging to three different zones. \textbf{Mode 3:} every partition needs to be assignated to three nodes. We try to spread the three nodes over different zones as much as possible. -\textbf{Remark: (TODO):} The algorithms below directly adapt to a redundancy of $r$ instead of 3. +\textbf{Warning:} This is a working document written incrementally. The latest version of the algorithm is the \textbf{parametric assignment} described in the next section. + + +\section{Computation of a parametric assignment} +\textbf{Warning: }We change notations in this section. + +Notation: let $P$ be the number of partitions, $N$ the number of nodes, $Z$ the number of zones. 
Let $\mathbf{P,N,Z}$ be the label sets of, respectively, partitions, nodes and zones. +Let $s^*$ be the largest partition size achievable with the redundancy constraints. Let $(c_n)_{n\in \mathbf{N}}$ be the storage capacity of every node. + +In this section, we propose a third specification of the problem. The user inputs two redundancy parameters $1\le \rho_\mathbf{Z} \le \rho_\mathbf{N}$. We compute an assignment $\alpha = (\alpha_p^1, \ldots, \alpha_p^{\rho_\mathbf{N}})_{p\in \mathbf{P}}$ such that every partition $p$ is associated to $\rho_\mathbf{N}$ distinct nodes $\alpha_p^1, \ldots, \alpha_p^{\rho_\mathbf{N}}$ and these nodes belong to at least $\rho_\mathbf{Z}$ distinct zones. + +If the layout contained a previous assignment $\alpha'$, we try to minimize the amount of data to transfer during the layout update by making $\alpha$ as close as possible to $\alpha'$. + +In the following subsections, we describe the successive steps of the algorithm we propose to compute $\alpha$. + +\subsubsection*{Algorithm} + +\begin{algorithmic}[1] + \Function{Compute Layout}{$\mathbf{N}$, $\mathbf{Z}$, $\mathbf{P}$, $(c_n)_{n\in \mathbf{N}}$, $\rho_\mathbf{N}$, $\rho_\mathbf{Z}$, $\alpha'$} + \State $s^* \leftarrow$ \Call{Compute Partition Size}{$\mathbf{N}$, $\mathbf{Z}$, $\mathbf{P}$, $(c_n)_{n\in \mathbf{N}}$, $\rho_\mathbf{N}$, $\rho_\mathbf{Z}$} + \State $G \leftarrow G(s^*)$ + \State $f \leftarrow$ \Call{Compute Candidate Assignment}{$G$, $\alpha'$} + \State $f^* \leftarrow$ \Call{Minimize transfer load}{$G$, $f$, $\alpha'$} + \State Build $\alpha^*$ from $f^*$ + \State \Return $\alpha^*$ + \EndFunction +\end{algorithmic} + +\subsubsection*{Complexity} +As we will see in the next sections, the worst case complexity of this algorithm is $O(P^2 N^2)$. The minimization of transfer load is the most expensive step, and it can run with a timeout since it is only an optimization step. 
Without this step (or with a smart timeout), the worst case complexity can be $O((PN)^{3/2}\log C)$ where $C$ is the total storage capacity of the cluster. + +\subsection{Determination of the partition size $s^*$} + +Again, we will represent an assignment $\alpha$ as a flow in a specific graph $G$. We will not compute the optimal partition size $s^*$ a priori, but we will determine it by dichotomy, as the largest size $s$ such that the maximal flow achievable on $G=G(s)$ has value $\rho_\mathbf{N}P$. We will assume that the capacities are given in a small enough unit (say, Megabytes), and we will determine $s^*$ at the precision of the given unit. + +Given some candidate size value $s$, we describe the oriented weighted graph $G=(V,E)$ with vertex set $V$ and arc set $E$. + +The set of vertices $V$ contains the source $\mathbf{s}$, the sink $\mathbf{t}$, vertices +$\mathbf{p, p^+, p^-}$ for every partition $p$, vertices $\mathbf{x}_{p,z}$ for every partition $p$ and zone $z$, and vertices $\mathbf{n}$ for every node $n$. + +The set of arcs $E$ contains: +\begin{itemize} + \item ($\mathbf{s}$,$\mathbf{p}$, $\rho_\mathbf{N}$) for every partition $p$; + \item ($\mathbf{p}$,$\mathbf{p}^+$, $\rho_\mathbf{Z}$) for every partition $p$; + \item ($\mathbf{p}$,$\mathbf{p}^-$, $\rho_\mathbf{N}-\rho_\mathbf{Z}$) for every partition $p$; + \item ($\mathbf{p}^+$,$\mathbf{x}_{p,z}$, 1) for every partition $p$ and zone $z$; + \item ($\mathbf{p}^-$,$\mathbf{x}_{p,z}$, $\rho_\mathbf{N}-\rho_\mathbf{Z}$) for every partition $p$ and zone $z$; + \item ($\mathbf{x}_{p,z}$,$\mathbf{n}$, 1) for every partition $p$, zone $z$ and node $n\in z$; + \item ($\mathbf{n}$, $\mathbf{t}$, $\lfloor c_n/s \rfloor$) for every node $n$. +\end{itemize} + +In the following complexity calculations, we will use the number of vertices and edges of $G$. Note from now on that $\# V = O(PZ)$ and $\# E = O(PN)$. 
+ +\begin{proposition} + An assignment $\alpha$ is realizable with partition size $s$ and the redundancy constraints $(\rho_\mathbf{N},\rho_\mathbf{Z})$ if and only if there exists a maximal flow function $f$ in $G$ with total flow $\rho_\mathbf{N}P$, such that the arcs ($\mathbf{x}_{p,z}$,$\mathbf{n}$, 1) used are exactly those for which $p$ is associated to $n$ in $\alpha$. +\end{proposition} +\begin{proof} + Given such a flow $f$, we can reconstruct a candidate $\alpha$. In $f$, the flow passing through every $\mathbf{p}$ is $\rho_\mathbf{N}$, and since every arc $(\mathbf{x}_{p,z},\mathbf{n})$ has capacity 1, every partition is associated to $\rho_\mathbf{N}$ distinct nodes. The fraction $\rho_\mathbf{Z}$ of the flow passing through every $\mathbf{p^+}$ must be spread over as many distinct zones, since every arc outgoing from $\mathbf{p^+}$ has capacity 1. So the reconstructed $\alpha$ verifies the redundancy constraints. For every node $n$, the flow between $\mathbf{n}$ and $\mathbf{t}$ corresponds to the number of partitions associated to $n$. By construction of $f$, this does not exceed $\lfloor c_n/s \rfloor$. We assumed that the partition size is $s$, hence this association does not exceed the storage capacity of the nodes. + + In the other direction, given an assignment $\alpha$, one can similarly check that the facts that $\alpha$ respects the redundancy constraints, and the storage capacities of the nodes, are sufficient conditions to construct a maximal flow function $f$. +\end{proof} + +\textbf{Implementation remark:} In the flow algorithm, while exploring the graph, we explore the neighbours of every vertex in a random order to heuristically spread the association between nodes and partitions. + +\subsubsection*{Algorithm} +With this result in mind, we can describe the first step of our algorithm. All divisions are supposed to be integer divisions. 
+\begin{algorithmic}[1] + \Function{Compute Partition Size}{$\mathbf{N}$, $\mathbf{Z}$, $\mathbf{P}$, $(c_n)_{n\in \mathbf{N}}$, $\rho_\mathbf{N}$, $\rho_\mathbf{Z}$} + + \State Build the graph $G=G(s=1)$ + \State $ f \leftarrow$ \Call{Maximal flow}{$G$} + \If{$f.\mathrm{total\ flow} < \rho_\mathbf{N}P$} + + \State \Return Error: capacities too small or constraints too strong. + \EndIf + + \State $s^- \leftarrow 1$ + \State $s^+ \leftarrow 1+\frac{1}{\rho_\mathbf{N}}\sum_{n \in \mathbf{N}} c_n$ + + \While{$s^-+1 < s^+$} + \State Build the graph $G=G(s=(s^-+s^+)/2)$ + \State $ f \leftarrow$ \Call{Maximal flow}{$G$} + \If{$f.\mathrm{total\ flow} < \rho_\mathbf{N}P$} + \State $s^+ \leftarrow (s^- + s^+)/2$ + \Else + \State $s^- \leftarrow (s^- + s^+)/2$ + \EndIf + \EndWhile + + \State \Return $s^-$ + \EndFunction +\end{algorithmic} + +\subsubsection*{Complexity} + +To compute the maximal flow, we use Dinic's algorithm. Its complexity on general graphs is $O(\#V^2 \#E)$, but on graphs with edge capacity bounded by a constant, it turns out to be $O(\#E^{3/2})$. The graph $G$ does not fall in this case since the capacities of the arcs incoming to $\mathbf{t}$ are far from bounded. However, the proof of this complexity works readily for graphs where we only ask the edges \emph{not} incoming to the sink $\mathbf{t}$ to have their capacities bounded by a constant. One can find the proof of this claim in \cite[Section 2]{even1975network}. +The dichotomy adds a logarithmic factor $\log (C)$ where $C=\sum_{n \in \mathbf{N}} c_n$ is the total capacity of the cluster. The total complexity of this first function is hence +$O(\#E^{3/2}\log C ) = O\big((PN)^{3/2} \log C\big)$. + +\subsubsection*{Metrics} +We can display the discrepancy between the computed $s^*$ and the best size we could hope for a given total capacity, that is $C/(\rho_\mathbf{N}P)$, since the $\rho_\mathbf{N}P$ stored partition copies must share the total capacity $C$. 
+ +\subsection{Computation of a candidate assignment} + +Now that we have the optimal partition size $s^*$, to compute a candidate assignment, it would be enough to compute a maximal flow function $f$ on $G(s^*)$. This is what we do if there was no previous assignment $\alpha'$. + +If there was some $\alpha'$, we add a step that will heuristically help to obtain a candidate $\alpha$ closer to $\alpha'$. To do so, we first compute a flow function $\tilde{f}$ that uses only the partition-to-node association appearing in $\alpha'$. Most likely, $\tilde{f}$ will not be a maximal flow of $G(s^*)$. In Dinic's algorithm, we can start from a non maximal flow function and then discover improving paths. This is what we do, starting from $\tilde{f}$. The hope\footnote{This is only a hope, because one can find examples where the construction of $f$ from $\tilde{f}$ produces an assignment $\alpha$ that is not as close as possible to $\alpha'$.} is that the final flow function $f$ will tend to keep the associations appearing in $\tilde{f}$. + +More formally, we construct the graph $G_{|\alpha'}$ from $G$ by removing all the arcs $(\mathbf{x}_{p,z},\mathbf{n}, 1)$ where $p$ is not associated to $n$ in $\alpha'$. We compute a maximal flow function $\tilde{f}$ in $G_{|\alpha'}$. $\tilde{f}$ is also a valid (most likely non maximal) flow function in $G$. We compute a maximal flow function $f$ on $G$ by starting Dinic's algorithm on $\tilde{f}$. + +\subsubsection*{Algorithm} +\begin{algorithmic}[1] + \Function{Compute Candidate Assignment}{$G$, $\alpha'$} + \State Build the graph $G_{|\alpha'}$ + \State $ \tilde{f} \leftarrow$ \Call{Maximal flow}{$G_{|\alpha'}$} + \State $ f \leftarrow$ \Call{Maximal flow from flow}{$G$, $\tilde{f}$} + \State \Return $f$ + \EndFunction +\end{algorithmic} + +\textbf{Remark:} The function ``Maximal flow'' can be just seen as the function ``Maximal flow from flow'' called with the zero flow function as starting flow.
+ +\subsubsection*{Complexity} +From the consideration of the last section, we have the complexity of the Dinic's algorithm $O(\#E^{3/2}) = O((PN)^{3/2})$. + +\subsubsection*{Metrics} + +We can display the flow value of $\tilde{f}$, which is an upper bound of the distance between $\alpha$ and $\alpha'$. It might be more a Debug level display than Info. + +\subsection{Minimization of the transfer load} + +Now that we have a candidate flow function $f$, we want to modify it to make its associated assignment as close as possible to $\alpha'$. Denote by $f'$ the maximal flow associated to $\alpha'$, and let $d(f, f')$ be the distance between the associated assignments\footnote{It is the number of arcs of type $(\mathbf{x}_{p,z},\mathbf{n})$ saturated in one flow and not in the other.}. +We want to build a sequence $f=f_0, f_1, f_2 \dots$ of maximal flows such that $d(f_i, f')$ decreases as $i$ increases. The distance being a non-negative integer, this sequence of flow functions must be finite. We now explain how to find some improving $f_{i+1}$ from $f_i$. + +For any maximal flow $f$ in $G$, we define the oriented weighted graph $G_f=(V, E_f)$ as follows. The vertices of $G_f$ are the same as the vertices of $G$. $E_f$ contains the arc $(v_1,v_2, w)$ between vertices $v_1,v_2\in V$ with weight $w$ if and only if the arc $(v_1,v_2)$ is not saturated in $f$ (i.e. $c(v_1,v_2)-f(v_1,v_2) \ge 1$, we also consider reversed arcs). The weight $w$ is: +\begin{itemize} + \item $-1$ if $(v_1,v_2)$ is of type $(\mathbf{x}_{p,z},\mathbf{n})$ or $(\mathbf{n},\mathbf{x}_{p,z})$ and is saturated in only one of the two flows $f,f'$; + \item $+1$ if $(v_1,v_2)$ is of type $(\mathbf{x}_{p,z},\mathbf{n})$ or $(\mathbf{n},\mathbf{x}_{p,z})$ and is saturated in either both or none of the two flows $f,f'$; + \item $0$ otherwise. +\end{itemize} + +If $\gamma$ is a simple cycle of arcs in $G_f$, we define its weight $w(\gamma)$ as the sum of the weights of its arcs.
We can add $+1$ to the value of $f$ on the arcs of $\gamma$, and by construction of $G_f$ and the fact that $\gamma$ is a cycle, the function that we get is still a valid flow function on $G$, it is maximal as it has the same flow value as $f$. We denote this new function $f+\gamma$. + +\begin{proposition} + Given a maximal flow $f$ and a simple cycle $\gamma$ in $G_f$, we have $d(f+\gamma, f') - d(f,f') = w(\gamma)$. +\end{proposition} +\begin{proof} + Let $X$ be the set of arcs of type $(\mathbf{x}_{p,z},\mathbf{n})$. Then we can express $d(f,f')$ as + \begin{align*} + d(f,f') & = \#\{e\in X ~|~ f(e)\neq f'(e)\} + = \sum_{e\in X} 1_{f(e)\neq f'(e)} \\ + & = \frac{1}{2}\big( \#X + \sum_{e\in X} 1_{f(e)\neq f'(e)} - 1_{f(e)= f'(e)} \big). + \end{align*} + We can express the cycle weight as + \begin{align*} + w(\gamma) & = \sum_{e\in X, e\in \gamma} - 1_{f(e)\neq f'(e)} + 1_{f(e)= f'(e)}. + \end{align*} + Remark that since we passed one unit of flow in $\gamma$ to construct $f+\gamma$, we have for any $e\in X$, $f(e)=f'(e)$ if and only if $(f+\gamma)(e) \neq f'(e)$. + Hence + \begin{align*} + w(\gamma) & = \frac{1}{2}(w(\gamma) + w(\gamma)) \\ + &= \frac{1}{2} \Big( + \sum_{e\in X, e\in \gamma} - 1_{f(e)\neq f'(e)} + 1_{f(e)= f'(e)} \\ + & \qquad + + \sum_{e\in X, e\in \gamma} 1_{(f+\gamma)(e)\neq f'(e)} - 1_{(f+\gamma)(e)= f'(e)} + \Big). + \end{align*} + Plugging this in the previous equation, we find that + $$d(f,f')+w(\gamma) = d(f+\gamma, f').$$ +\end{proof} + +This result suggests that given some flow $f_i$, we just need to find a negative cycle $\gamma$ in $G_{f_i}$ to construct $f_{i+1}$ as $f_i+\gamma$. The following proposition ensures that this greedy strategy reaches an optimal flow. + +\begin{proposition} + For any maximal flow $f$, $G_f$ contains a negative cycle if and only if there exists a maximal flow $f^*$ in $G$ such that $d(f^*, f') < d(f, f')$. +\end{proposition} +\begin{proof} + Suppose that there is such flow $f^*$.
Define the oriented multigraph $M_{f,f^*}=(V,E_M)$ with the same vertex set $V$ as in $G$, and for every $v_1,v_2 \in V$, $E_M$ contains $(f^*(v_1,v_2) - f(v_1,v_2))_+$ copies of the arc $(v_1,v_2)$. For every vertex $v$, its total degree (meaning its outer degree minus its inner degree) is equal to + \begin{align*} + \deg v & = \sum_{u\in V} (f^*(v,u) - f(v,u))_+ - \sum_{u\in V} (f^*(u,v) - f(u,v))_+ \\ + & = \sum_{u\in V} f^*(v,u) - f(v,u) = \sum_{u\in V} f^*(v,u) - \sum_{u\in V} f(v,u). + \end{align*} + The last two sums are zero for any inner vertex since $f,f^*$ are flows, and they are equal on the source and sink since the two flows are both maximal and have hence the same value. Thus, $\deg v = 0$ for every vertex $v$. + + This implies that the multigraph $M_{f,f^*}$ is the union of disjoint simple cycles. $f$ can be transformed into $f^*$ by pushing a mass 1 along all these cycles in any order. Since $d(f^*, f') Date: Wed, 21 Sep 2022 14:39:59 +0200 Subject: New version of the algorithm that calculate the layout. It takes as paramters the replication factor and the zone redundancy, computes the largest partition size reachable with these constraints, and among the possible assignation with this partition size, it computes the one that moves the least number of partitions compared to the previous assignation. 
This computation uses graph algorithms defined in graph_algo.rs --- doc/optimal_layout_report/optimal_layout.pdf | Bin 395187 -> 395308 bytes doc/optimal_layout_report/optimal_layout.tex | 17 +- src/rpc/graph_algo.rs | 440 +++++++++++++++ src/rpc/layout.rs | 795 ++++++++++++++++----------- src/rpc/lib.rs | 2 + src/rpc/ring.rs | 1 + src/rpc/system.rs | 5 +- src/util/bipartite.rs | 363 ------------ src/util/lib.rs | 1 - 9 files changed, 926 insertions(+), 698 deletions(-) create mode 100644 src/rpc/graph_algo.rs delete mode 100644 src/util/bipartite.rs diff --git a/doc/optimal_layout_report/optimal_layout.pdf b/doc/optimal_layout_report/optimal_layout.pdf index c85803e8..0af34161 100644 Binary files a/doc/optimal_layout_report/optimal_layout.pdf and b/doc/optimal_layout_report/optimal_layout.pdf differ diff --git a/doc/optimal_layout_report/optimal_layout.tex b/doc/optimal_layout_report/optimal_layout.tex index b2898adb..005e7b50 100644 --- a/doc/optimal_layout_report/optimal_layout.tex +++ b/doc/optimal_layout_report/optimal_layout.tex @@ -100,13 +100,12 @@ Again, we will represent an assignment $\alpha$ as a flow in a specific graph $G Given some candidate size value $s$, we describe the oriented weighted graph $G=(V,E)$ with vertex set $V$ arc set $E$. The set of vertices $V$ contains the source $\mathbf{s}$, the sink $\mathbf{t}$, vertices -$\mathbf{p, p^+, p^-}$ for every partition $p$, vertices $\mathbf{x}_{p,z}$ for every partition $p$ and zone $z$, and vertices $\mathbf{n}$ for every node $n$. +$\mathbf{p^+, p^-}$ for every partition $p$, vertices $\mathbf{x}_{p,z}$ for every partition $p$ and zone $z$, and vertices $\mathbf{n}$ for every node $n$. 
The set of arcs $E$ contains: \begin{itemize} - \item ($\mathbf{s}$,$\mathbf{p}$, $\rho_\mathbf{N}$) for every partition $p$; - \item ($\mathbf{p}$,$\mathbf{p}^+$, $\rho_\mathbf{Z}$) for every partition $p$; - \item ($\mathbf{p}$,$\mathbf{p}^+$, $\rho_\mathbf{N}-\rho_\mathbf{Z}$) for every partition $p$; + \item ($\mathbf{s}$,$\mathbf{p}^+$, $\rho_\mathbf{Z}$) for every partition $p$; + \item ($\mathbf{s}$,$\mathbf{p}^-$, $\rho_\mathbf{N}-\rho_\mathbf{Z}$) for every partition $p$; \item ($\mathbf{p}^+$,$\mathbf{x}_{p,z}$, 1) for every partition $p$ and zone $z$; \item ($\mathbf{p}^-$,$\mathbf{x}_{p,z}$, $\rho_\mathbf{N}-\rho_\mathbf{Z}$) for every partition $p$ and zone $z$; \item ($\mathbf{x}_{p,z}$,$\mathbf{n}$, 1) for every partition $p$, zone $z$ and node $n\in z$; @@ -119,7 +118,7 @@ In the following complexity calculations, we will use the number of vertices and An assignment $\alpha$ is realizable with partition size $s$ and the redundancy constraints $(\rho_\mathbf{N},\rho_\mathbf{Z})$ if and only if there exists a maximal flow function $f$ in $G$ with total flow $\rho_\mathbf{N}P$, such that the arcs ($\mathbf{x}_{p,z}$,$\mathbf{n}$, 1) used are exactly those for which $p$ is associated to $n$ in $\alpha$. \end{proposition} \begin{proof} - Given such flow $f$, we can reconstruct a candidate $\alpha$. In $f$, the flow passing through every $\mathbf{p}$ is $\rho_\mathbf{N}$, and since the outgoing capacity of every $\mathbf{x}_{p,z}$ is 1, every partition is associated to $\rho_\mathbf{N}$ distinct nodes. The fraction $\rho_\mathbf{Z}$ of the flow passing through every $\mathbf{p^+}$ must be spread over as many distinct zones as every arc outgoing from $\mathbf{p^+}$ has capacity 1. So the reconstructed $\alpha$ verifies the redundancy constraints. For every node $n$, the flow between $\mathbf{n}$ and $\mathbf{t}$ corresponds to the number of partitions associated to $n$. By construction of $f$, this does not exceed $\lfloor c_n/s \rfloor$. 
We assumed that the partition size is $s$, hence this association does not exceed the storage capacity of the nodes. + Given such flow $f$, we can reconstruct a candidate $\alpha$. In $f$, the flow passing through $\mathbf{p^+}$ and $\mathbf{p^-}$ is $\rho_\mathbf{N}$, and since the outgoing capacity of every $\mathbf{x}_{p,z}$ is 1, every partition is associated to $\rho_\mathbf{N}$ distinct nodes. The fraction $\rho_\mathbf{Z}$ of the flow passing through every $\mathbf{p^+}$ must be spread over as many distinct zones as every arc outgoing from $\mathbf{p^+}$ has capacity 1. So the reconstructed $\alpha$ verifies the redundancy constraints. For every node $n$, the flow between $\mathbf{n}$ and $\mathbf{t}$ corresponds to the number of partitions associated to $n$. By construction of $f$, this does not exceed $\lfloor c_n/s \rfloor$. We assumed that the partition size is $s$, hence this association does not exceed the storage capacity of the nodes. In the other direction, given an assignment $\alpha$, one can similarly check that the facts that $\alpha$ respects the redundancy constraints, and the storage capacities of the nodes, are necessary condition to construct a maximal flow function $f$. \end{proof} @@ -272,16 +271,16 @@ The distance $d(f,f')$ is bounded by the maximal number of differences in the as The detection of negative cycle is done with the Bellman-Ford algorithm, whose complexity should normally be $O(\#E\#V)$. In our case, it amounts to $O(P^2ZN)$. Multiplied by the complexity of the outer loop, it amounts to $O(P^3ZN)$ which is a lot when the number of partitions and nodes starts to be large. To avoid that, we adapt the Bellman-Ford algorithm. -The Bellman-Ford algorithm runs $\#V$ iterations of an outer loop, and an inner loop over $E$. The idea is to compute the shortest paths from a source vertex $v$ to all other vertices. After $k$ iterations of the outer loop, the algorithm has computed all shortest path of length at most $k$. 
All shortest path have length at most $\#V$, so if there is an update in the last iteration of the loop, it means that there is a negative cycle in the graph. The observation that will enable us to improve the complexity is the following: +The Bellman-Ford algorithm runs $\#V$ iterations of an outer loop, and an inner loop over $E$. The idea is to compute the shortest paths from a source vertex $v$ to all other vertices. After $k$ iterations of the outer loop, the algorithm has computed all shortest path of length at most $k$. All simple paths have length at most $\#V-1$, so if there is an update in the last iteration of the loop, it means that there is a negative cycle in the graph. The observation that will enable us to improve the complexity is the following: \begin{proposition} - In the graph $G_f$ (and $G$), all simple paths and cycles have a length at most $6N$. + In the graph $G_f$ (and $G$), all simple paths have a length at most $4N$. \end{proposition} \begin{proof} - Since $f$ is a maximal flow, there is no outgoing edge from $\mathbf{s}$ in $G_f$. One can thus check than any simple path of length 6 must contain at least to node of type $\mathbf{n}$. Hence on a cycle, at most 6 arcs separate two successive nodes of type $\mathbf{n}$. + Since $f$ is a maximal flow, there is no outgoing edge from $\mathbf{s}$ in $G_f$. One can thus check than any simple path of length 4 must contain at least two node of type $\mathbf{n}$. Hence on a path, at most 4 arcs separate two successive nodes of type $\mathbf{n}$. \end{proof} -Thus, in the absence of negative cycles, shortest paths in $G_f$ have length at most $6N$. So we can do only $6N$ iterations of the outer loop in Bellman-Ford algorithm. This makes the complexity of the detection of one set of cycle to be $O(N\#E) = O(N^2 P)$. +Thus, in the absence of negative cycles, shortest paths in $G_f$ have length at most $4N$. So we can do only $4N+1$ iterations of the outer loop in Bellman-Ford algorithm. 
This makes the complexity of the detection of one set of cycle to be $O(N\#E) = O(N^2 P)$. With this improvement, the complexity of the whole algorithm is, in the worst case, $O(N^2P^2)$. However, since we detect several cycles at once and we start with a flow that might be close to the previous one, the number of iterations of the outer loop might be smaller in practice. diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs new file mode 100644 index 00000000..1a809b80 --- /dev/null +++ b/src/rpc/graph_algo.rs @@ -0,0 +1,440 @@ + +//! This module deals with graph algorithms. +//! It is used in layout.rs to build the partition to node assignation. + +use rand::prelude::SliceRandom; +use std::cmp::{max, min}; +use std::collections::VecDeque; +use std::collections::HashMap; + +//Vertex data structures used in all the graphs used in layout.rs. +//usize parameters correspond to node/zone/partitions ids. +//To understand the vertex roles below, please refer to the formal description +//of the layout computation algorithm. +#[derive(Clone,Copy,Debug, PartialEq, Eq, Hash)] +pub enum Vertex{ + Source, + Pup(usize), //The vertex p+ of partition p + Pdown(usize), //The vertex p- of partition p + PZ(usize,usize), //The vertex corresponding to x_(partition p, zone z) + N(usize), //The vertex corresponding to node n + Sink +} + + +//Edge data structure for the flow algorithm. +//The graph is stored as an adjacency list +#[derive(Clone, Copy, Debug)] +pub struct FlowEdge { + cap: u32, //flow maximal capacity of the edge + flow: i32, //flow value on the edge + dest: usize, //destination vertex id + rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v +} + +//Edge data structure for the detection of negative cycles. +//The graph is stored as a list of edges (u,v). 
+#[derive(Clone, Copy, Debug)] +pub struct WeightedEdge { + w: i32, //weight of the edge + dest: usize, +} + +pub trait Edge: Clone + Copy {} +impl Edge for FlowEdge {} +impl Edge for WeightedEdge {} + +//Struct for the graph structure. We do encapsulation here to be able to both +//provide user friendly Vertex enum to address vertices, and to use usize indices +//and Vec instead of HashMap in the graph algorithm to optimize execution speed. +pub struct Graph{ + vertextoid : HashMap, + idtovertex : Vec, + + graph : Vec< Vec > +} + +pub type CostFunction = HashMap<(Vertex,Vertex), i32>; + +impl Graph{ + pub fn new(vertices : &[Vertex]) -> Self { + let mut map = HashMap::::new(); + for i in 0..vertices.len() { + map.insert(vertices[i] , i); + } + return Graph:: { + vertextoid : map, + idtovertex: vertices.to_vec(), + graph : vec![Vec::< E >::new(); vertices.len() ] + } + } +} + +impl Graph{ + //This function adds a directed edge to the graph with capacity c, and the + //corresponding reversed edge with capacity 0. + pub fn add_edge(&mut self, u: Vertex, v:Vertex, c: u32) -> Result<(), String>{ + if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idu = self.vertextoid[&u]; + let idv = self.vertextoid[&v]; + let rev_u = self.graph[idu].len(); + let rev_v = self.graph[idv].len(); + self.graph[idu].push( FlowEdge{cap: c , dest: idv , flow: 0, rev : rev_v} ); + self.graph[idv].push( FlowEdge{cap: 0 , dest: idu , flow: 0, rev : rev_u} ); + Ok(()) + } + + //This function returns the list of vertices that receive a positive flow from + //vertex v. 
+ pub fn get_positive_flow_from(&self , v:Vertex) -> Result< Vec , String>{ + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = Vec::::new(); + for edge in self.graph[idv].iter() { + if edge.flow > 0 { + result.push(self.idtovertex[edge.dest]); + } + } + return Ok(result); + } + + + //This function returns the value of the flow incoming to v. + pub fn get_inflow(&self , v:Vertex) -> Result< i32 , String>{ + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0,self.graph[edge.dest][edge.rev].flow); + } + return Ok(result); + } + + //This function returns the value of the flow outgoing from v. + pub fn get_outflow(&self , v:Vertex) -> Result< i32 , String>{ + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0,edge.flow); + } + return Ok(result); + } + + //This function computes the flow total value by computing the outgoing flow + //from the source. + pub fn get_flow_value(&mut self) -> Result { + return self.get_outflow(Vertex::Source); + } + + //This function shuffles the order of the edge lists. It keeps the ids of the + //reversed edges consistent. + fn shuffle_edges(&mut self) { + let mut rng = rand::thread_rng(); + for i in 0..self.graph.len() { + self.graph[i].shuffle(&mut rng); + //We need to update the ids of the reverse edges. 
+ for j in 0..self.graph[i].len() { + let target_v = self.graph[i][j].dest; + let target_rev = self.graph[i][j].rev; + self.graph[target_v][target_rev].rev = j; + } + } + } + + //Computes an upper bound of the flow n the graph + pub fn flow_upper_bound(&self) -> u32{ + let idsource = self.vertextoid[&Vertex::Source]; + let mut flow_upper_bound = 0; + for edge in self.graph[idsource].iter(){ + flow_upper_bound += edge.cap; + } + return flow_upper_bound; + } + + //This function computes the maximal flow using Dinic's algorithm. It starts with + //the flow values already present in the graph. So it is possible to add some edge to + //the graph, compute a flow, add other edges, update the flow. + pub fn compute_maximal_flow(&mut self) -> Result<(), String> { + if !self.vertextoid.contains_key(&Vertex::Source) { + return Err("The graph does not contain a source.".to_string()); + } + if !self.vertextoid.contains_key(&Vertex::Sink) { + return Err("The graph does not contain a sink.".to_string()); + } + + let idsource = self.vertextoid[&Vertex::Source]; + let idsink = self.vertextoid[&Vertex::Sink]; + + let nb_vertices = self.graph.len(); + + let flow_upper_bound = self.flow_upper_bound(); + + //To ensure the dispersion of the associations generated by the + //assignation, we shuffle the neighbours of the nodes. Hence, + //the vertices do not consider their neighbours in the same order. + self.shuffle_edges(); + + //We run Dinic's max flow algorithm + loop { + //We build the level array from Dinic's algorithm. 
+ let mut level = vec![None; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((idsource, 0)); + while !fifo.is_empty() { + if let Some((id, lvl)) = fifo.pop_front() { + if level[id] == None { //it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i32 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); + } + } + } + } + } + if level[idsink] == None { + //There is no residual flow + break; + } + + //Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = VecDeque::new(); + + lifo.push_back((idsource, flow_upper_bound)); + + while let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == idsink { + //The DFS reached the sink, we can add a + //residual flow. + lifo.pop_back(); + while !lifo.is_empty() { + if let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i32; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i32; + } + } + lifo.push_back((idsource, flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= self.graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back() { + next_nbd[*parent] += 1; + } + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min(f, self.graph[id][nbd].cap - self.graph[id][nbd].flow as u32 ); + if let (Some(lvldest), Some(lvlid)) = + (level[self.graph[id][nbd].dest], level[id]){ + if lvldest <= lvlid || new_flow == 0 { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + } + //otherwise, we send flow to nbd. 
+ lifo.push_back((self.graph[id][nbd].dest, new_flow)); + } + } + Ok(()) + } + + //This function takes a flow, and a cost function on the edges, and tries to find an + // equivalent flow with a better cost, by finding improving overflow cycles. It uses + // as subroutine the Bellman Ford algorithm run up to path_length. + // We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only + // one needs to be present in the cost function. + pub fn optimize_flow_with_cost(&mut self , cost: &CostFunction, path_length: usize ) + -> Result<(),String>{ + + //We build the weighted graph g where we will look for negative cycle + let mut gf = self.build_cost_graph(cost)?; + let mut cycles = gf.list_negative_cycles(path_length); + while cycles.len() > 0 { + //we enumerate negative cycles + for c in cycles.iter(){ + for i in 0..c.len(){ + //We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertextoid[&c[i]]; + let idv = self.vertextoid[&c[(i+1)%c.len()]]; + for j in 0..self.graph[idu].len(){ + //since idu appears at most once in the cycles, we enumerate every + //edge at most once. 
+ let edge = self.graph[idu][j]; + if edge.dest == idv { + self.graph[idu][j].flow += 1; + self.graph[idv][edge.rev].flow -=1; + break; + } + } + } + } + + gf = self.build_cost_graph(cost)?; + cycles = gf.list_negative_cycles(path_length); + } + return Ok(()); + } + + //Construct the weighted graph G_f from the flow and the cost function + fn build_cost_graph(&self , cost: &CostFunction) -> Result,String>{ + + let mut g = Graph::::new(&self.idtovertex); + let nb_vertices = self.idtovertex.len(); + for i in 0..nb_vertices { + for edge in self.graph[i].iter() { + if edge.cap as i32 -edge.flow > 0 { + //It is possible to send overflow through this edge + let u = self.idtovertex[i]; + let v = self.idtovertex[edge.dest]; + if cost.contains_key(&(u,v)) { + g.add_edge(u,v, cost[&(u,v)])?; + } + else if cost.contains_key(&(v,u)) { + g.add_edge(u,v, -cost[&(v,u)])?; + } + else{ + g.add_edge(u,v, 0)?; + } + } + } + } + return Ok(g); + + } + + +} + +impl Graph{ + //This function adds a single directed weighted edge to the graph. + pub fn add_edge(&mut self, u: Vertex, v:Vertex, w: i32) -> Result<(), String>{ + if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idu = self.vertextoid[&u]; + let idv = self.vertextoid[&v]; + self.graph[idu].push( WeightedEdge{w: w , dest: idv} ); + Ok(()) + } + + //This function lists the negative cycles it manages to find after path_length + //iterations of the main loop of the Bellman-Ford algorithm. For the classical + //algorithm, path_length needs to be equal to the number of vertices. However, + //for particular graph structures like our case, the algorithm is still correct + //when path_length is the length of the longest possible simple path. + //See the formal description of the algorithm for more details. 
+ fn list_negative_cycles(&self, path_length: usize) -> Vec< Vec > { + + let nb_vertices = self.graph.len(); + + //We start with every vertex at distance 0 of some imaginary extra -1 vertex. + let mut distance = vec![0 ; nb_vertices]; + //The prev vector collects for every vertex from where does the shortest path come + let mut prev = vec![None; nb_vertices]; + + for _ in 0..path_length +1 { + for id in 0..nb_vertices{ + for e in self.graph[id].iter(){ + if distance[id] + e.w < distance[e.dest] { + distance[e.dest] = distance[id] + e.w; + prev[e.dest] = Some(id); + } + } + } + } + + //If self.graph contains a negative cycle, then at this point the graph described + //by prev (which is a directed 1-forest/functional graph) + //must contain a cycle. We list the cycles of prev. + let cycles_prev = cycles_of_1_forest(&prev); + + //Remark that the cycle in prev is in the reverse order compared to the cycle + //in the graph. Thus the .rev(). + return cycles_prev.iter().map(|cycle| cycle.iter().rev().map( + |id| self.idtovertex[*id] + ).collect() ).collect(); + } + +} + + +//This function returns the list of cycles of a directed 1 forest. It does not +//check for the consistency of the input. +fn cycles_of_1_forest(forest: &[Option]) -> Vec> { + let mut cycles = Vec::>::new(); + let mut time_of_discovery = vec![None; forest.len()]; + + for t in 0..forest.len(){ + let mut id = t; + //while we are on a valid undiscovered node + while time_of_discovery[id] == None { + time_of_discovery[id] = Some(t); + if let Some(i) = forest[id] { + id = i; + } + else{ + break; + } + } + if forest[id] != None && time_of_discovery[id] == Some(t) { + //We discovered an id that we explored at this iteration t. 
+ //It means we are on a cycle + let mut cy = vec![id; 1]; + let id2 = id; + while let Some(id2) = forest[id2] { + if id2 != id { + cy.push(id2); + } + else { + break; + } + } + cycles.push(cy); + } + } + return cycles; +} + + +//==================================================================================== +//==================================================================================== +//==================================================================================== +//==================================================================================== +//==================================================================================== +//==================================================================================== + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_flow() { + let left_vec = vec![3; 8]; + let right_vec = vec![0, 4, 8, 4, 8]; + //There are asserts in the function that computes the flow + } + + //maybe add tests relative to the matching optilization ? +} diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 40f97368..ff60ce98 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,17 +1,23 @@ -use std::cmp::min; use std::cmp::Ordering; use std::collections::HashMap; +use std::collections::HashSet; + +use hex::ToHex; use serde::{Deserialize, Serialize}; -use garage_util::bipartite::*; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; -use rand::prelude::SliceRandom; +use crate::graph_algo::*; use crate::ring::*; +use std::convert::TryInto; + +//The Message type will be used to collect information on the algorithm. +type Message = Vec; + /// The layout of the cluster, i.e. 
the list of roles /// which are assigned to each cluster node #[derive(Clone, Debug, Serialize, Deserialize)] @@ -19,12 +25,21 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, + #[serde(default="default_one")] + pub zone_redundancy: usize, + + //This attribute is only used to retain the previously computed partition size, + //to know to what extent does it change with the layout update. + #[serde(default="default_zero")] + pub partition_size: u32, + pub roles: LwwMap, /// node_id_vec: a vector of node IDs with a role assigned /// in the system (this includes gateway nodes). /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers + /// 1. non-gateway nodes are first so that they have lower numbers holding + /// in u8 (the number of non-gateway nodes is at most 256). /// 2. nodes that don't have a role are excluded (but they need to /// stay in the CRDT as tombstones) pub node_id_vec: Vec, @@ -38,6 +53,15 @@ pub struct ClusterLayout { pub staging_hash: Hash, } +fn default_one() -> usize{ + return 1; +} +fn default_zero() -> u32{ + return 0; +} + +const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -66,16 +90,31 @@ impl NodeRole { None => "gateway".to_string(), } } + + pub fn tags_string(&self) -> String { + let mut tags = String::new(); + if self.tags.len() == 0 { + return tags + } + tags.push_str(&self.tags[0].clone()); + for t in 1..self.tags.len(){ + tags.push_str(","); + tags.push_str(&self.tags[t].clone()); + } + return tags; + } } impl ClusterLayout { - pub fn new(replication_factor: usize) -> Self { + pub fn new(replication_factor: usize, zone_redundancy: usize) -> Self { let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); ClusterLayout { version: 0, 
replication_factor, + zone_redundancy, + partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), @@ -122,6 +161,44 @@ impl ClusterLayout { } } + ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub fn useful_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity != None => result.push(*uuid), + _ => () + } + } + return result; + } + + ///Given a node uuids, this function returns the label of its zone + pub fn get_node_zone(&self, uuid : &Uuid) -> Result { + match self.node_role(uuid) { + Some(role) => return Ok(role.zone.clone()), + _ => return Err("The Uuid does not correspond to a node present in the cluster.".to_string()) + } + } + + ///Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), + _ => return Err("The Uuid does not correspond to a node present in the cluster or this node does not have a positive capacity.".to_string()) + } + } + + ///Returns the sum of capacities of non gateway nodes in the cluster + pub fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.useful_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + return Ok(total_capacity); + } + + /// Check a cluster layout for internal consistency /// returns true if consistent, false if error pub fn check(&self) -> bool { @@ -168,342 +245,412 @@ impl ClusterLayout { true } +} + +impl ClusterLayout { /// This function calculates a new partition-to-node assignation. 
- /// The computed assignation maximizes the capacity of a + /// The computed assignation respects the node replication factor + /// and the zone redundancy parameter It maximizes the capacity of a /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of - /// data to be moved. A heuristic ensures node triplets - /// dispersion (in garage_util::bipartite::optimize_matching()). - pub fn calculate_partition_assignation(&mut self) -> bool { + /// data to be moved. + pub fn calculate_partition_assignation(&mut self, replication:usize, redundancy:usize) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. - let old_node_assignation = self.update_nodes_and_ring(); - - let (node_zone, _) = self.get_node_zone_capacity(); - - //We compute the optimal number of partition to assign to - //every node and zone. - if let Some((part_per_nod, part_per_zone)) = self.optimal_proportions() { - //We collect part_per_zone in a vec to not rely on the - //arbitrary order in which elements are iterated in - //Hashmap::iter() - let part_per_zone_vec = part_per_zone - .iter() - .map(|(x, y)| (x.clone(), *y)) - .collect::>(); - //We create an indexing of the zones - let mut zone_id = HashMap::::new(); - for (i, ppz) in part_per_zone_vec.iter().enumerate() { - zone_id.insert(ppz.0.clone(), i); - } - - //We compute a candidate for the new partition to zone - //assignation. 
- let nb_zones = part_per_zone.len(); - let nb_nodes = part_per_nod.len(); - let nb_partitions = 1 << PARTITION_BITS; - let left_cap_vec = vec![self.replication_factor as u32; nb_partitions]; - let right_cap_vec = part_per_zone_vec.iter().map(|(_, y)| *y as u32).collect(); - let mut zone_assignation = dinic_compute_matching(left_cap_vec, right_cap_vec); - - //We create the structure for the partition-to-node assignation. - let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; - //We will decrement part_per_nod to keep track of the number - //of partitions that we still have to associate. - let mut part_per_nod = part_per_nod; - - //We minimize the distance to the former assignation(if any) - - //We get the id of the zones of the former assignation - //(and the id no_zone if there is no node assignated) - let no_zone = part_per_zone_vec.len(); - let old_zone_assignation: Vec> = old_node_assignation - .iter() - .map(|x| { - x.iter() - .map(|id| match *id { - Some(i) => zone_id[&node_zone[i]], - None => no_zone, - }) - .collect() - }) - .collect(); - - //We minimize the distance to the former zone assignation - zone_assignation = - optimize_matching(&old_zone_assignation, &zone_assignation, nb_zones + 1); //+1 for no_zone - - //We need to assign partitions to nodes in their zone - //We first put the nodes assignation that can stay the same - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if let Some(Some(former_node)) = old_node_assignation[i].iter().find(|x| { - if let Some(id) = x { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - } else { - false - } - }) { - if part_per_nod[*former_node] > 0 { - node_assignation[i][j] = Some(*former_node); - part_per_nod[*former_node] -= 1; - } - } - } - } - - //We complete the assignation of partitions to nodes - let mut rng = rand::thread_rng(); - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - if node_assignation[i][j] == None { - let 
possible_nodes: Vec = (0..nb_nodes) - .filter(|id| { - zone_id[&node_zone[*id]] == zone_assignation[i][j] - && part_per_nod[*id] > 0 - }) - .collect(); - assert!(!possible_nodes.is_empty()); - //We randomly pick a node - if let Some(nod) = possible_nodes.choose(&mut rng) { - node_assignation[i][j] = Some(*nod); - part_per_nod[*nod] -= 1; - } - } - } - } - - //We write the assignation in the 1D table - self.ring_assignation_data = Vec::::new(); - for ass in node_assignation { - for nod in ass { - if let Some(id) = nod { - self.ring_assignation_data.push(id as CompactNodeType); - } else { - panic!() - } - } - } - - true - } else { - false - } - } + + //We update the node ids, since the node list might have changed with the staged + //changes in the layout. We retrieve the old_assignation reframed with the new ids + let old_assignation_opt = self.update_node_id_vec()?; + self.replication_factor = replication; + self.zone_redundancy = redundancy; + + let mut msg = Message::new(); + msg.push(format!("Computation of a new cluster layout where partitions are + replicated {} times on at least {} distinct zones.", replication, redundancy)); + + //We generate for once numerical ids for the zone, to use them as indices in the + //flow graphs. + let (id_to_zone , zone_to_id) = self.generate_zone_ids()?; + + msg.push(format!("The cluster contains {} nodes spread over {} zones.", + self.useful_nodes().len(), id_to_zone.len())); + + //We compute the optimal partition size + let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + if old_assignation_opt != None { + msg.push(format!("Given the replication and redundancy constraint, the + optimal size of a partition is {}. 
In the previous layout, it used to + be {}.", partition_size, self.partition_size)); + } + else { + msg.push(format!("Given the replication and redundancy constraints, the + optimal size of a partition is {}.", partition_size)); + } + self.partition_size = partition_size; + + //We compute a first flow/assignment that is heuristically close to the previous + //assignment + let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; + + if let Some(assoc) = &old_assignation_opt { + //We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; + } + + msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); + + //We update the layout structure + self.update_ring_from_flow(id_to_zone.len() , &gflow)?; + return Ok(msg); + } /// The LwwMap of node roles might have changed. This function updates the node_id_vec /// and returns the assignation given by ring, with the new indices of the nodes, and - /// None of the node is not present anymore. + /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. 
- fn update_nodes_and_ring(&mut self) -> Vec>> { - let nb_partitions = 1usize << PARTITION_BITS; - let mut node_assignation = vec![vec![None; self.replication_factor]; nb_partitions]; - let rf = self.replication_factor; - let ring = &self.ring_assignation_data; - - let new_node_id_vec: Vec = self.roles.items().iter().map(|(k, _, _)| *k).collect(); - - if ring.len() == rf * nb_partitions { - for i in 0..nb_partitions { - for j in 0..self.replication_factor { - node_assignation[i][j] = new_node_id_vec - .iter() - .position(|id| *id == self.node_id_vec[ring[i * rf + j] as usize]); - } - } - } - - self.node_id_vec = new_node_id_vec; - self.ring_assignation_data = vec![]; - node_assignation + fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,String> { + // (1) We compute the new node list + //Non gateway nodes should be coded on 8bits, hence they must be first in the list + //We build the new node ids + let mut new_non_gateway_nodes: Vec = self.roles.items().iter() + .filter(|(_, _, v)| + match &v.0 {Some(r) if r.capacity != None => true, _=> false }) + .map(|(k, _, _)| *k).collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(format!("There are more than {} non-gateway nodes in the new layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + } + + let mut new_gateway_nodes: Vec = self.roles.items().iter() + .filter(|(_, _, v)| + match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) + .map(|(k, _, _)| *k).collect(); + + let nb_useful_nodes = new_non_gateway_nodes.len(); + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.append(&mut new_non_gateway_nodes); + new_node_id_vec.append(&mut new_gateway_nodes); + + + // (2) We retrieve the old association + //We rewrite the old association with the new indices. We only consider partition + //to node assignations where the node is still in use. 
+ let nb_partitions = 1usize << PARTITION_BITS; + let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; + + if self.ring_assignation_data.len() == 0 { + //This is a new association + return Ok(None); + } + if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + return Err("The old assignation does not have a size corresponding to the old replication factor or the number of partitions.".to_string()); + } + + //We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + //We add the indices of only the new non-gateway nodes that can be used in the + //association ring + for i in 0..nb_useful_nodes { + uuid_to_new_id.insert(new_node_id_vec[i], i ); + } + + let rf= self.replication_factor; + for p in 0..nb_partitions { + for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { + let uuid = self.node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assignation[p].push(uuid_to_new_id[&uuid]); + } + } + } + + //We write the results + self.node_id_vec = new_node_id_vec; + self.ring_assignation_data = Vec::::new(); + + return Ok(Some(old_assignation)); } - ///This function compute the number of partition to assign to - ///every node and zone, so that every partition is replicated - ///self.replication_factor times and the capacity of a partition - ///is maximized. 
- fn optimal_proportions(&mut self) -> Option<(Vec, HashMap)> { - let mut zone_capacity: HashMap = HashMap::new(); - - let (node_zone, node_capacity) = self.get_node_zone_capacity(); - let nb_nodes = self.node_id_vec.len(); - - for i in 0..nb_nodes { - if zone_capacity.contains_key(&node_zone[i]) { - zone_capacity.insert( - node_zone[i].clone(), - zone_capacity[&node_zone[i]] + node_capacity[i], - ); - } else { - zone_capacity.insert(node_zone[i].clone(), node_capacity[i]); - } - } - - //Compute the optimal number of partitions per zone - let sum_capacities: u32 = zone_capacity.values().sum(); - - if sum_capacities == 0 { - println!("No storage capacity in the network."); - return None; - } - - let nb_partitions = 1 << PARTITION_BITS; - //Initially we would like to use zones porportionally to - //their capacity. - //However, a large zone can be associated to at most - //nb_partitions to ensure replication of the date. - //So we take the min with nb_partitions: - let mut part_per_zone: HashMap = zone_capacity - .iter() - .map(|(k, v)| { - ( - k.clone(), - min( - nb_partitions, - (self.replication_factor * nb_partitions * *v as usize) - / sum_capacities as usize, - ), - ) - }) - .collect(); - - //The replication_factor-1 upper bounds the number of - //part_per_zones that are greater than nb_partitions - for _ in 1..self.replication_factor { - //The number of partitions that are not assignated to - //a zone that takes nb_partitions. - let sum_capleft: u32 = zone_capacity - .keys() - .filter(|k| part_per_zone[*k] < nb_partitions) - .map(|k| zone_capacity[k]) - .sum(); - - //The number of replication of the data that we need - //to ensure. 
- let repl_left = self.replication_factor - - part_per_zone - .values() - .filter(|x| **x == nb_partitions) - .count(); - if repl_left == 0 { - break; - } - - for k in zone_capacity.keys() { - if part_per_zone[k] != nb_partitions { - part_per_zone.insert( - k.to_string(), - min( - nb_partitions, - (nb_partitions * zone_capacity[k] as usize * repl_left) - / sum_capleft as usize, - ), - ); - } - } - } - - //Now we divide the zone's partition share proportionally - //between their nodes. - - let mut part_per_nod: Vec = (0..nb_nodes) - .map(|i| { - (part_per_zone[&node_zone[i]] * node_capacity[i] as usize) - / zone_capacity[&node_zone[i]] as usize - }) - .collect(); - - //We must update the part_per_zone to make it correspond to - //part_per_nod (because of integer rounding) - part_per_zone = part_per_zone.iter().map(|(k, _)| (k.clone(), 0)).collect(); - for i in 0..nb_nodes { - part_per_zone.insert( - node_zone[i].clone(), - part_per_zone[&node_zone[i]] + part_per_nod[i], - ); - } - - //Because of integer rounding, the total sum of part_per_nod - //might not be replication_factor*nb_partitions. - // We need at most to add 1 to every non maximal value of - // part_per_nod. The capacity of a partition will be bounded - // by the minimal value of - // node_capacity_vec[i]/part_per_nod[i] - // so we try to maximize this minimal value, keeping the - // part_per_zone capped - - let discrepancy: usize = - nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); - - //We use a stupid O(N^2) algorithm. If the number of nodes - //is actually expected to be high, one should optimize this. 
- - for _ in 0..discrepancy { - if let Some(idmax) = (0..nb_nodes) - .filter(|i| part_per_zone[&node_zone[*i]] < nb_partitions) - .max_by(|i, j| { - (node_capacity[*i] * (part_per_nod[*j] + 1) as u32) - .cmp(&(node_capacity[*j] * (part_per_nod[*i] + 1) as u32)) - }) { - part_per_nod[idmax] += 1; - part_per_zone.insert( - node_zone[idmax].clone(), - part_per_zone[&node_zone[idmax]] + 1, - ); - } - } - - //We check the algorithm consistency - - let discrepancy: usize = - nb_partitions * self.replication_factor - part_per_nod.iter().sum::(); - assert!(discrepancy == 0); - assert!(if let Some(v) = part_per_zone.values().max() { - *v <= nb_partitions - } else { - false - }); - - Some((part_per_nod, part_per_zone)) - } - - //Returns vectors of zone and capacity; indexed by the same (temporary) - //indices as node_id_vec. - fn get_node_zone_capacity(&self) -> (Vec, Vec) { - let node_zone = self - .node_id_vec - .iter() - .map(|id_nod| match self.node_role(id_nod) { - Some(NodeRole { - zone, - capacity: _, - tags: _, - }) => zone.clone(), - _ => "".to_string(), - }) - .collect(); - - let node_capacity = self - .node_id_vec - .iter() - .map(|id_nod| match self.node_role(id_nod) { - Some(NodeRole { - zone: _, - capacity: Some(c), - tags: _, - }) => *c, - _ => 0, - }) - .collect(); - - (node_zone, node_capacity) - } + ///This function generates ids for the zone of the nodes appearing in + ///self.node_id_vec. 
+ fn generate_zone_ids(&self) -> Result<(Vec, HashMap),String>{ + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.node_id_vec.iter() { + if self.roles.get(uuid) == None { + return Err("The uuid was not found in the node roles (this should not happen, it might be a critical error).".to_string()); + } + match self.node_role(&uuid) { + Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone() , id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + _ => () + } + } + return Ok((id_to_zone, zone_to_id)); + } + + ///This function computes by dichotomy the largest realizable partition size, given + ///the layout. + fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ + let nb_partitions = 1usize << PARTITION_BITS; + let empty_set = HashSet::<(usize,usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { + return Err("The storage capacity of he cluster is to small. It is impossible to store partitions of size 1.".to_string()); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down +1 < s_up { + g = self.generate_flow_graph((s_down+s_up)/2, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? 
< (nb_partitions*self.replication_factor).try_into().unwrap() { + s_up = (s_down+s_up)/2; + } + else { + s_down = (s_down+s_up)/2; + } + } + + return Ok(s_down); + } + + fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + return vertices; + } + + fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, String> { + let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), + self.useful_nodes().len()); + let mut g= Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), self.zone_redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - self.zone_redundancy) as u32)?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; + g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , + self.replication_factor as u32)?; + } + } + for n in 0..self.useful_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity/size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p,n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + return Ok(g); + } + + + fn compute_candidate_assignment(&self, zone_to_id: &HashMap, + old_assoc_opt : &Option >>) -> Result, String > { + + //We list the edges that are not used in the old association + let mut exclude_edge = HashSet::<(usize,usize)>::new(); + if let Some(old_assoc) = old_assoc_opt { + let nb_nodes = self.useful_nodes().len(); + for p in 
0..NB_PARTITIONS { + for n in 0..nb_nodes { + exclude_edge.insert((p,n)); + } + for n in old_assoc[p].iter() { + exclude_edge.remove(&(p,*n)); + } + } + } + + //We compute the best flow using only the edges used in the old assoc + let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge )?; + g.compute_maximal_flow()?; + for (p,n) in exclude_edge.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + return Ok(g); + } + + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), String > { + let mut cost = CostFunction::new(); + for p in 0..NB_PARTITIONS { + for n in old_assoc[p].iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); + } + } + let nb_nodes = self.useful_nodes().len(); + let path_length = 4*nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + return Ok(()); + } + + fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), String>{ + self.ring_assignation_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; + for vertex in assoc_vertex.iter() { + match vertex{ + Vertex::N(n) => self.ring_assignation_data.push((*n).try_into().unwrap()), + _ => () + } + } + } + } + + if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { + return Err("Critical Error : the association ring we produced does not have the right size.".to_string()); + } + return Ok(()); + } + + + //This function returns a message summing up the partition repartition of the new + //layout. 
+ fn output_stat(&self , gflow : &Graph, + old_assoc_opt : &Option< Vec> >, + zone_to_id: &HashMap, + id_to_zone : &Vec) -> Result{ + let mut msg = Message::new(); + + let nb_partitions = 1usize << PARTITION_BITS; + let used_cap = self.partition_size * nb_partitions as u32 * + self.replication_factor as u32; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); + msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + used_cap , total_cap , percent_cap )); + msg.push(format!("If the percentage is to low, it might be that the replication/redundancy constraints force the use of nodes/zones with small storage capacities. + You might want to rebalance the storage capacities or relax the constraints. See the detailed statistics below and look for saturated nodes/zones.")); + msg.push(format!("Recall that because of the replication, the actual available storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); + + //We define and fill in the following tables + let storing_nodes = self.useful_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..nb_partitions { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; + if pz_nodes.len() > 0 { + stored_partitions_zone[z] += 1; + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(old_assoc) = old_assoc_opt { + if !old_assoc[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p.push( + zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); 
+ } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + } + + //We display the statistics + + if *old_assoc_opt != None { + let total_new_partitions : usize = new_partitions.iter().sum(); + msg.push(format!("A total of {} new copies of partitions need to be \ + transferred.", total_new_partitions)); + } + msg.push(format!("")); + msg.push(format!("Detailed statistics by zones and nodes.")); + + for z in 0..id_to_zone.len(){ + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len(){ + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions : usize = nodes_of_z.iter() + .map(|n| stored_partitions[*n]).sum(); + msg.push(format!("")); + + if *old_assoc_opt != None { + msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ + {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], + new_partitions_zone[z], replicated_partitions)); + } + else{ + msg.push(format!("Zone {}: {} distinct partitions stored ({} partition \ + copies) ", + id_to_zone[z], stored_partitions_zone[z], replicated_partitions)); + } + + let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); + msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", + available_cap_z, total_cap_z, percent_cap_z)); + msg.push(format!("")); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; + let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self.node_role(&self.node_id_vec[*n]) + .ok_or("Node not found."))?.tags_string(); + msg.push(format!(" Node {}: {} partitions ({} new) ; \ + available/total capacity: {} / {} ({:.1}%) ; tags:{}", + 
&self.node_id_vec[*n].to_vec().encode_hex::(), + stored_partitions[*n], + new_partitions[*n], available_cap_n, total_cap_n, + (available_cap_n as f32)/(total_cap_n as f32)*100.0 , + tags_n)); + } + } + + return Ok(msg); + } + } +//==================================================================================== + #[cfg(test)] mod tests { use super::*; diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 392ff48f..1036a8e1 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -8,9 +8,11 @@ mod consul; mod kubernetes; pub mod layout; +pub mod graph_algo; pub mod ring; pub mod system; + mod metrics; pub mod rpc_helper; diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs index 73a126a2..743a5cba 100644 --- a/src/rpc/ring.rs +++ b/src/rpc/ring.rs @@ -40,6 +40,7 @@ pub struct Ring { // Type to store compactly the id of a node in the system // Change this to u16 the day we want to have more than 256 nodes in a cluster pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; // The maximum number of times an object might get replicated // This must be at least 3 because Garage supports 3-way replication diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 68d94ea5..313671ca 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -97,6 +97,7 @@ pub struct System { kubernetes_discovery: Option, replication_factor: usize, + zone_redundancy: usize, /// The ring pub ring: watch::Receiver>, @@ -192,6 +193,7 @@ impl System { network_key: NetworkKey, background: Arc, replication_factor: usize, + zone_redundancy: usize, config: &Config, ) -> Arc { let node_key = @@ -211,7 +213,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor) + ClusterLayout::new(replication_factor, zone_redundancy) } }; @@ -285,6 +287,7 @@ impl System { rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), system_endpoint, replication_factor, + zone_redundancy, rpc_listen_addr: 
config.rpc_bind_addr, rpc_public_addr, bootstrap_peers: config.bootstrap_peers.clone(), diff --git a/src/util/bipartite.rs b/src/util/bipartite.rs deleted file mode 100644 index 1e1e9caa..00000000 --- a/src/util/bipartite.rs +++ /dev/null @@ -1,363 +0,0 @@ -/* - * This module deals with graph algorithm in complete bipartite - * graphs. It is used in layout.rs to build the partition to node - * assignation. - * */ - -use rand::prelude::SliceRandom; -use std::cmp::{max, min}; -use std::collections::VecDeque; - -//Graph data structure for the flow algorithm. -#[derive(Clone, Copy, Debug)] -struct EdgeFlow { - c: i32, - flow: i32, - v: usize, - rev: usize, -} - -//Graph data structure for the detection of positive cycles. -#[derive(Clone, Copy, Debug)] -struct WeightedEdge { - w: i32, - u: usize, - v: usize, -} - -/* This function takes two matchings (old_match and new_match) in a - * complete bipartite graph. It returns a matching that has the - * same degree as new_match at every vertex, and that is as close - * as possible to old_match. - * */ -pub fn optimize_matching( - old_match: &[Vec], - new_match: &[Vec], - nb_right: usize, -) -> Vec> { - let nb_left = old_match.len(); - let ed = WeightedEdge { w: -1, u: 0, v: 0 }; - let mut edge_vec = vec![ed; nb_left * nb_right]; - - //We build the complete bipartite graph structure, represented - //by the list of all edges. - for i in 0..nb_left { - for j in 0..nb_right { - edge_vec[i * nb_right + j].u = i; - edge_vec[i * nb_right + j].v = nb_left + j; - } - } - - for i in 0..edge_vec.len() { - //We add the old matchings - if old_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { - edge_vec[i].w *= -1; - } - //We add the new matchings - if new_match[edge_vec[i].u].contains(&(edge_vec[i].v - nb_left)) { - (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - //Now edge_vec is a graph where edges are oriented LR if we - //can add them to new_match, and RL otherwise. 
If - //adding/removing them makes the matching closer to old_match - //they have weight 1; and -1 otherwise. - - //We shuffle the edge list so that there is no bias depending in - //partitions/zone label in the triplet dispersion - let mut rng = rand::thread_rng(); - edge_vec.shuffle(&mut rng); - - //Discovering and flipping a cycle with positive weight in this - //graph will make the matching closer to old_match. - //We use Bellman Ford algorithm to discover positive cycles - while let Some(cycle) = positive_cycle(&edge_vec, nb_left, nb_right) { - for i in cycle { - //We flip the edges of the cycle. - (edge_vec[i].u, edge_vec[i].v) = (edge_vec[i].v, edge_vec[i].u); - edge_vec[i].w *= -1; - } - } - - //The optimal matching is build from the graph structure. - let mut matching = vec![Vec::::new(); nb_left]; - for e in edge_vec { - if e.u > e.v { - matching[e.v].push(e.u - nb_left); - } - } - matching -} - -//This function finds a positive cycle in a bipartite wieghted graph. -fn positive_cycle( - edge_vec: &[WeightedEdge], - nb_left: usize, - nb_right: usize, -) -> Option> { - let nb_side_min = min(nb_left, nb_right); - let nb_vertices = nb_left + nb_right; - let weight_lowerbound = -((nb_left + nb_right) as i32) - 1; - let mut accessed = vec![false; nb_left]; - - //We try to find a positive cycle accessible from the left - //vertex i. - for i in 0..nb_left { - if accessed[i] { - continue; - } - let mut weight = vec![weight_lowerbound; nb_vertices]; - let mut prev = vec![edge_vec.len(); nb_vertices]; - weight[i] = 0; - //We compute largest weighted paths from i. - //Since the graph is bipartite, any simple cycle has length - //at most 2*nb_side_min. In the general Bellman-Ford - //algorithm, the bound here is the number of vertices. Since - //the number of partitions can be much larger than the - //number of nodes, we optimize that. 
- for _ in 0..(2 * nb_side_min) { - for (j, e) in edge_vec.iter().enumerate() { - if weight[e.v] < weight[e.u] + e.w { - weight[e.v] = weight[e.u] + e.w; - prev[e.v] = j; - } - } - } - //We update the accessed table - for i in 0..nb_left { - if weight[i] > weight_lowerbound { - accessed[i] = true; - } - } - //We detect positive cycle - for e in edge_vec { - if weight[e.v] < weight[e.u] + e.w { - //it means e is on a path branching from a positive cycle - let mut was_seen = vec![false; nb_vertices]; - let mut curr = e.u; - //We track back with prev until we reach the cycle. - while !was_seen[curr] { - was_seen[curr] = true; - curr = edge_vec[prev[curr]].u; - } - //Now curr is on the cycle. We collect the edges ids. - let mut cycle = vec![prev[curr]]; - let mut cycle_vert = edge_vec[prev[curr]].u; - while cycle_vert != curr { - cycle.push(prev[cycle_vert]); - cycle_vert = edge_vec[prev[cycle_vert]].u; - } - - return Some(cycle); - } - } - } - - None -} - -// This function takes two arrays of capacity and computes the -// maximal matching in the complete bipartite graph such that the -// left vertex i is matched to left_cap_vec[i] right vertices, and -// the right vertex j is matched to right_cap_vec[j] left vertices. -// To do so, we use Dinic's maximum flow algorithm. 
-pub fn dinic_compute_matching(left_cap_vec: Vec, right_cap_vec: Vec) -> Vec> { - let mut graph = Vec::>::new(); - let ed = EdgeFlow { - c: 0, - flow: 0, - v: 0, - rev: 0, - }; - - // 0 will be the source - graph.push(vec![ed; left_cap_vec.len()]); - for (i, c) in left_cap_vec.iter().enumerate() { - graph[0][i].c = *c as i32; - graph[0][i].v = i + 2; - graph[0][i].rev = 0; - } - - //1 will be the sink - graph.push(vec![ed; right_cap_vec.len()]); - for (i, c) in right_cap_vec.iter().enumerate() { - graph[1][i].c = *c as i32; - graph[1][i].v = i + 2 + left_cap_vec.len(); - graph[1][i].rev = 0; - } - - //we add left vertices - for i in 0..left_cap_vec.len() { - graph.push(vec![ed; 1 + right_cap_vec.len()]); - graph[i + 2][0].c = 0; //directed - graph[i + 2][0].v = 0; - graph[i + 2][0].rev = i; - - for j in 0..right_cap_vec.len() { - graph[i + 2][j + 1].c = 1; - graph[i + 2][j + 1].v = 2 + left_cap_vec.len() + j; - graph[i + 2][j + 1].rev = i + 1; - } - } - - //we add right vertices - for i in 0..right_cap_vec.len() { - let lft_ln = left_cap_vec.len(); - graph.push(vec![ed; 1 + lft_ln]); - graph[i + lft_ln + 2][0].c = graph[1][i].c; - graph[i + lft_ln + 2][0].v = 1; - graph[i + lft_ln + 2][0].rev = i; - - for j in 0..left_cap_vec.len() { - graph[i + 2 + lft_ln][j + 1].c = 0; //directed - graph[i + 2 + lft_ln][j + 1].v = j + 2; - graph[i + 2 + lft_ln][j + 1].rev = i + 1; - } - } - - //To ensure the dispersion of the triplets generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //left vertices do not consider the right ones in the same order. - let mut rng = rand::thread_rng(); - for i in 0..graph.len() { - graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. 
- for j in 0..graph[i].len() { - let target_v = graph[i][j].v; - let target_rev = graph[i][j].rev; - graph[target_v][target_rev].rev = j; - } - } - - let nb_vertices = graph.len(); - - //We run Dinic's max flow algorithm - loop { - //We build the level array from Dinic's algorithm. - let mut level = vec![-1; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((0, 0)); - while !fifo.is_empty() { - if let Some((id, lvl)) = fifo.pop_front() { - if level[id] == -1 { - level[id] = lvl; - for e in graph[id].iter() { - if e.c - e.flow > 0 { - fifo.push_back((e.v, lvl + 1)); - } - } - } - } - } - if level[1] == -1 { - //There is no residual flow - break; - } - - //Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); - - let flow_upper_bound = if let Some(x) = left_cap_vec.iter().max() { - *x as i32 - } else { - panic!(); - }; - - lifo.push_back((0, flow_upper_bound)); - - while let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == 1 { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - graph[id][nbd].flow += f; - let id_v = graph[id][nbd].v; - let nbd_v = graph[id][nbd].rev; - graph[id_v][nbd_v].flow -= f; - } - } - lifo.push_back((0, flow_upper_bound)); - continue; - } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { - next_nbd[*parent] += 1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f, graph[id][nbd].c - graph[id][nbd].flow); - if level[graph[id][nbd].v] <= level[id] || new_flow == 0 { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - //otherwise, we send flow to nbd. 
- lifo.push_back((graph[id][nbd].v, new_flow)); - } - } - - //We return the association - let assoc_table = (0..left_cap_vec.len()) - .map(|id| { - graph[id + 2] - .iter() - .filter(|e| e.flow > 0) - .map(|e| e.v - 2 - left_cap_vec.len()) - .collect() - }) - .collect(); - - //consistency check - - //it is a flow - for i in 3..graph.len() { - assert!(graph[i].iter().map(|e| e.flow).sum::() == 0); - for e in graph[i].iter() { - assert!(e.flow + graph[e.v][e.rev].flow == 0); - } - } - - //it solves the matching problem - for i in 0..left_cap_vec.len() { - assert!(left_cap_vec[i] as i32 == graph[i + 2].iter().map(|e| max(0, e.flow)).sum::()); - } - for i in 0..right_cap_vec.len() { - assert!( - right_cap_vec[i] as i32 - == graph[i + 2 + left_cap_vec.len()] - .iter() - .map(|e| max(0, e.flow)) - .sum::() - ); - } - - assoc_table -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_flow() { - let left_vec = vec![3; 8]; - let right_vec = vec![0, 4, 8, 4, 8]; - //There are asserts in the function that computes the flow - let _ = dinic_compute_matching(left_vec, right_vec); - } - - //maybe add tests relative to the matching optilization ? 
-} diff --git a/src/util/lib.rs b/src/util/lib.rs index 891549c3..e83fc2e6 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -4,7 +4,6 @@ extern crate tracing; pub mod background; -pub mod bipartite; pub mod config; pub mod crdt; pub mod data; -- cgit v1.2.3 From bd842e1388a324e2a3956465e9b32d0dc739a8d9 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 22 Sep 2022 19:30:01 +0200 Subject: Correction of a few bugs in the tests, modification of ClusterLayout::check --- src/rpc/graph_algo.rs | 41 ++++++------ src/rpc/layout.rs | 173 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 137 insertions(+), 77 deletions(-) diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 1a809b80..a5a1e4ba 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -182,7 +182,7 @@ impl Graph{ //assignation, we shuffle the neighbours of the nodes. Hence, //the vertices do not consider their neighbours in the same order. self.shuffle_edges(); - + //We run Dinic's max flow algorithm loop { //We build the level array from Dinic's algorithm. @@ -206,7 +206,6 @@ impl Graph{ //There is no residual flow break; } - //Now we run DFS respecting the level array let mut next_nbd = vec![0; nb_vertices]; let mut lifo = VecDeque::new(); @@ -220,14 +219,12 @@ impl Graph{ //The DFS reached the sink, we can add a //residual flow. 
lifo.pop_back(); - while !lifo.is_empty() { - if let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i32; - let id_rev = self.graph[id][nbd].dest; - let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i32; - } + while let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i32; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i32; } lifo.push_back((idsource, flow_upper_bound)); continue; @@ -243,10 +240,14 @@ impl Graph{ continue; } //else we can try to send flow from id to its nbd - let new_flow = min(f, self.graph[id][nbd].cap - self.graph[id][nbd].flow as u32 ); + let new_flow = min(f as i32, self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow) as u32; + if new_flow == 0 { + next_nbd[id] += 1; + continue; + } if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]){ - if lvldest <= lvlid || new_flow == 0 { + if lvldest <= lvlid { //We cannot send flow to nbd. next_nbd[id] += 1; continue; @@ -266,7 +267,6 @@ impl Graph{ // one needs to be present in the cost function. pub fn optimize_flow_with_cost(&mut self , cost: &CostFunction, path_length: usize ) -> Result<(),String>{ - //We build the weighted graph g where we will look for negative cycle let mut gf = self.build_cost_graph(cost)?; let mut cycles = gf.list_negative_cycles(path_length); @@ -364,6 +364,7 @@ impl Graph{ } } + //If self.graph contains a negative cycle, then at this point the graph described //by prev (which is a directed 1-forest/functional graph) //must contain a cycle. We list the cycles of prev. @@ -401,8 +402,9 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { //We discovered an id that we explored at this iteration t. 
//It means we are on a cycle let mut cy = vec![id; 1]; - let id2 = id; - while let Some(id2) = forest[id2] { + let mut id2 = id; + while let Some(id_next) = forest[id2] { + id2 = id_next; if id2 != id { cy.push(id2); } @@ -429,12 +431,5 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { mod tests { use super::*; - #[test] - fn test_flow() { - let left_vec = vec![3; 8]; - let right_vec = vec![0, 4, 8, 4, 8]; - //There are asserts in the function that computes the flow - } - - //maybe add tests relative to the matching optilization ? } + diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index ff60ce98..a878f19c 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::collections::HashSet; use hex::ToHex; +use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -185,7 +186,8 @@ impl ClusterLayout { pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err("The Uuid does not correspond to a node present in the cluster or this node does not have a positive capacity.".to_string()) + _ => return Err("The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity.".to_string()) } } @@ -242,6 +244,47 @@ impl ClusterLayout { } } + //Check that every partition is associated to distinct nodes + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignation_data[rf*p..rf*(p+1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return false; + } + //Check that every partition is spread over at least zone_redundancy zones. 
+ let zones_of_p = nodes_of_p.iter() + .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.")); + if zones_of_p.unique().count() < self.zone_redundancy { + return false; + } + } + + //Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignation_data.iter() { + node_usage[*n as usize] += 1; + } + for n in 0..MAX_NODE_NUMBER { + if node_usage[n] > 0 { + let uuid = self.node_id_vec[n]; + if node_usage[n]*self.partition_size > self.get_node_capacity(&uuid) + .expect("Critical Error"){ + return false; + } + } + } + + //Check that the partition size stored is the one computed by the asignation + //algorithm. + let cl2 = self.clone(); + let (_ , zone_to_id) = cl2.generate_zone_ids().expect("Critical Error"); + let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); + if partition_size != self.partition_size { + return false; + } + + true } @@ -267,7 +310,7 @@ impl ClusterLayout { self.zone_redundancy = redundancy; let mut msg = Message::new(); - msg.push(format!("Computation of a new cluster layout where partitions are + msg.push(format!("Computation of a new cluster layout where partitions are \ replicated {} times on at least {} distinct zones.", replication, redundancy)); //We generate for once numerical ids for the zone, to use them as indices in the @@ -276,16 +319,19 @@ impl ClusterLayout { msg.push(format!("The cluster contains {} nodes spread over {} zones.", self.useful_nodes().len(), id_to_zone.len())); - + //We compute the optimal partition size + //Capacities should be given in a unit so that partition size is at least 100. + //In this case, integer rounding plays a marginal role in the percentages of + //optimality. 
let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; if old_assignation_opt != None { - msg.push(format!("Given the replication and redundancy constraint, the - optimal size of a partition is {}. In the previous layout, it used to + msg.push(format!("Given the replication and redundancy constraint, the \ + optimal size of a partition is {}. In the previous layout, it used to \ be {}.", partition_size, self.partition_size)); } else { - msg.push(format!("Given the replication and redundancy constraints, the + msg.push(format!("Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", partition_size)); } self.partition_size = partition_size; @@ -293,13 +339,13 @@ impl ClusterLayout { //We compute a first flow/assignment that is heuristically close to the previous //assignment let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; - if let Some(assoc) = &old_assignation_opt { //We minimize the distance to the previous assignment. self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; } msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); + msg.push("".to_string()); //We update the layout structure self.update_ring_from_flow(id_to_zone.len() , &gflow)?; @@ -321,7 +367,8 @@ impl ClusterLayout { .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(format!("There are more than {} non-gateway nodes in the new layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + return Err(format!("There are more than {} non-gateway nodes in the new \ + layout. 
This is not allowed.", MAX_NODE_NUMBER).to_string()); } let mut new_gateway_nodes: Vec = self.roles.items().iter() @@ -346,7 +393,8 @@ impl ClusterLayout { return Ok(None); } if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err("The old assignation does not have a size corresponding to the old replication factor or the number of partitions.".to_string()); + return Err("The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions.".to_string()); } //We build a translation table between the uuid and new ids @@ -384,7 +432,8 @@ impl ClusterLayout { for uuid in self.node_id_vec.iter() { if self.roles.get(uuid) == None { - return Err("The uuid was not found in the node roles (this should not happen, it might be a critical error).".to_string()); + return Err("The uuid was not found in the node roles (this should \ + not happen, it might be a critical error).".to_string()); } match self.node_role(&uuid) { Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { @@ -405,7 +454,8 @@ impl ClusterLayout { let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err("The storage capacity of he cluster is to small. It is impossible to store partitions of size 1.".to_string()); + return Err("The storage capacity of he cluster is to small. 
It is \ + impossible to store partitions of size 1.".to_string()); } let mut s_down = 1; @@ -525,11 +575,12 @@ impl ClusterLayout { } if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err("Critical Error : the association ring we produced does not have the right size.".to_string()); + return Err("Critical Error : the association ring we produced does not \ + have the right size.".to_string()); } return Ok(()); } - + //This function returns a message summing up the partition repartition of the new //layout. @@ -546,9 +597,16 @@ impl ClusterLayout { let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap , total_cap , percent_cap )); - msg.push(format!("If the percentage is to low, it might be that the replication/redundancy constraints force the use of nodes/zones with small storage capacities. - You might want to rebalance the storage capacities or relax the constraints. See the detailed statistics below and look for saturated nodes/zones.")); - msg.push(format!("Recall that because of the replication, the actual available storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); + msg.push(format!("")); + msg.push(format!("If the percentage is to low, it might be that the \ + replication/redundancy constraints force the use of nodes/zones with small \ + storage capacities. \ + You might want to rebalance the storage capacities or relax the constraints. 
\ + See the detailed statistics below and look for saturated nodes/zones.")); + msg.push(format!("Recall that because of the replication, the actual available \ + storage capacity is {} / {} = {}.", + used_cap , self.replication_factor , + used_cap/self.replication_factor as u32)); //We define and fill in the following tables let storing_nodes = self.useful_nodes(); @@ -563,6 +621,16 @@ impl ClusterLayout { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; if pz_nodes.len() > 0 { stored_partitions_zone[z] += 1; + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p.push( + zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } } for vert in pz_nodes.iter() { if let Vertex::N(n) = *vert { @@ -574,21 +642,17 @@ impl ClusterLayout { } } } - if let Some(old_assoc) = old_assoc_opt { - let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { - old_zones_of_p.push( - zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } } } + + if *old_assoc_opt == None { + new_partitions = stored_partitions.clone(); + new_partitions_zone = stored_partitions_zone.clone(); + } //We display the statistics + msg.push(format!("")); if *old_assoc_opt != None { let total_new_partitions : usize = new_partitions.iter().sum(); msg.push(format!("A total of {} new copies of partitions need to be \ @@ -608,16 +672,9 @@ impl ClusterLayout { .map(|n| stored_partitions[*n]).sum(); msg.push(format!("")); - if *old_assoc_opt != None { - msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ + msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], new_partitions_zone[z], replicated_partitions)); - } - else{ - msg.push(format!("Zone {}: {} distinct 
partitions stored ({} partition \ - copies) ", - id_to_zone[z], stored_partitions_zone[z], replicated_partitions)); - } let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; let mut total_cap_z = 0; @@ -625,18 +682,17 @@ impl ClusterLayout { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); - msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", + msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", available_cap_z, total_cap_z, percent_cap_z)); - msg.push(format!("")); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; let tags_n = (self.node_role(&self.node_id_vec[*n]) .ok_or("Node not found."))?.tags_string(); - msg.push(format!(" Node {}: {} partitions ({} new) ; \ + msg.push(format!(" Node {}: {} partitions ({} new) ; \ available/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec().encode_hex::(), + &self.node_id_vec[*n].to_vec()[0..2].to_vec().encode_hex::(), stored_partitions[*n], new_partitions[*n], available_cap_n, total_cap_n, (available_cap_n as f32)/(total_cap_n as f32)*100.0 , @@ -654,16 +710,14 @@ impl ClusterLayout { #[cfg(test)] mod tests { use super::*; - use itertools::Itertools; - + use std::io::*; +// use itertools::Itertools; +/* fn check_assignation(cl: &ClusterLayout) { //Check that input data has the right format let nb_partitions = 1usize << PARTITION_BITS; - assert!([1, 2, 3].contains(&cl.replication_factor)); assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); - let (node_zone, node_capacity) = cl.get_node_zone_capacity(); - //Check that is is a correct assignation with zone redundancy let rf = cl.replication_factor; for i in 0..nb_partitions { @@ -743,6 +797,13 @@ mod tests { } } } +*/ + + fn show_msg(msg : &Message) { 
+ for s in msg.iter(){ + println!("{}",s); + } + } fn update_layout( cl: &mut ClusterLayout, @@ -769,7 +830,8 @@ mod tests { #[test] fn test_assignation() { - let mut node_id_vec = vec![1, 2, 3]; + std::io::stdout().flush().ok().expect("Could not flush stdout"); + let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] .into_iter() @@ -782,14 +844,16 @@ mod tests { roles: LwwMap::new(), replication_factor: 3, + zone_redundancy: 1, + partition_size: 0, ring_assignation_data: vec![], version: 0, staging: LwwMap::new(), - staging_hash: sha256sum(&[1; 32]), + staging_hash: blake2sum(&rmp_to_vec_all_named(&LwwMap::::new()).unwrap()[..]), }; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; @@ -798,17 +862,18 @@ mod tests { .map(|x| x.to_string()) .collect(); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + assert!(cl.check()); - node_capacity_vec = vec![4000, 4000, 2000, 7000, 1000, 9000, 2000, 10, 2000]; + node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - cl.calculate_partition_assignation(); - 
check_assignation(&cl); + show_msg(&cl.calculate_partition_assignation(3,1).unwrap()); + assert!(cl.check()); + } } -- cgit v1.2.3 From 99f96b9564c9c841dc6c56f1255a6e70ff884d46 Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 4 Oct 2022 18:09:24 +0200 Subject: deleted zone_redundancy from System struct --- src/rpc/system.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 313671ca..34031b10 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -97,7 +97,6 @@ pub struct System { kubernetes_discovery: Option, replication_factor: usize, - zone_redundancy: usize, /// The ring pub ring: watch::Receiver>, @@ -287,7 +286,6 @@ impl System { rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), system_endpoint, replication_factor, - zone_redundancy, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, bootstrap_peers: config.bootstrap_peers.clone(), -- cgit v1.2.3 From ceac3713d6639f9170fc3b4475fae4a30b34483c Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 5 Oct 2022 15:29:48 +0200 Subject: modifications in several files to : - have consistent error return types - store the zone redundancy in a Lww - print the error and message in the CLI (TODO: for the server Api, should msg be returned in the body response?) 
--- src/api/admin/cluster.rs | 7 ++- src/garage/cli/layout.rs | 35 ++++++++------ src/rpc/layout.rs | 118 +++++++++++++++++++++++++++++------------------ src/rpc/system.rs | 3 +- 4 files changed, 100 insertions(+), 63 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 99c6e332..630179b5 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -162,7 +162,12 @@ pub async fn handle_apply_cluster_layout( let param = parse_json_body::(req).await?; let layout = garage.system.get_cluster_layout(); - let layout = layout.apply_staged_changes(Some(param.version))?; + let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; + //TODO : how to display msg ? Should it be in the Body Response ? + for s in msg.iter() { + println!("{}", s); + } + garage.system.update_cluster_layout(&layout).await?; Ok(Response::builder() diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 3884bb92..a5b838e7 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -188,19 +188,23 @@ pub async fn cmd_show_layout( // this will print the stats of what partitions // will move around when we apply - if layout.calculate_partition_assignation() { - println!("To enact the staged role changes, type:"); - println!(); - println!(" garage layout apply --version {}", layout.version + 1); - println!(); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - layout.version + 1 - ); - } else { - println!("Not enough nodes have an assigned role to maintain enough copies of data."); - println!("This new layout cannot yet be applied."); - } + match layout.calculate_partition_assignation() { + Ok(msg) => { + for line in msg.iter() { + println!("{}", line); + } + println!("To enact the staged role changes, type:"); + println!(); + println!(" garage layout apply --version {}", layout.version + 1); + println!(); + println!( + "You can also revert all proposed changes with: garage 
layout revert --version {}", + layout.version + 1)}, + Err(Error::Message(s)) => { + println!("Error while trying to compute the assignation: {}", s); + println!("This new layout cannot yet be applied.");}, + _ => { println!("Unknown Error"); }, + } } Ok(()) @@ -213,7 +217,10 @@ pub async fn cmd_apply_layout( ) -> Result<(), Error> { let layout = fetch_layout(rpc_cli, rpc_host).await?; - let layout = layout.apply_staged_changes(apply_opt.version)?; + let (layout, msg) = layout.apply_staged_changes(apply_opt.version)?; + for line in msg.iter() { + println!("{}", line); + } send_layout(rpc_cli, rpc_host, layout).await?; diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 16d573c7..8d2b3e17 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; -use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; +use garage_util::crdt::{AutoCrdt, Crdt, LwwMap, Lww}; use garage_util::data::*; use garage_util::error::*; @@ -27,12 +27,10 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, - #[serde(default="default_one")] - pub zone_redundancy: usize, //This attribute is only used to retain the previously computed partition size, //to know to what extent does it change with the layout update. 
- #[serde(default="default_zero")] + #[serde(default="default_partition_size")] pub partition_size: u32, pub roles: LwwMap, @@ -51,17 +49,31 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout + #[serde(default="default_layout_parameters")] + pub parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } -fn default_one() -> usize{ - return 1; -} -fn default_zero() -> u32{ +fn default_partition_size() -> u32{ return 0; } +fn default_layout_parameters() -> Lww{ + Lww::::new(LayoutParameters{ zone_redundancy: 1}) +} + +///This struct is used to set the parameters to be used in the assignation computation +///algorithm. It is stored as a Crdt. +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct LayoutParameters { + pub zone_redundancy:usize, +} + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] @@ -108,18 +120,24 @@ impl NodeRole { } impl ClusterLayout { - pub fn new(replication_factor: usize, zone_redundancy: usize) -> Self { + pub fn new(replication_factor: usize) -> Self { + + //We set the default zone redundancy to be equal to the replication factor, + //i.e. as strict as possible. 
+ let default_parameters = Lww::::new( + LayoutParameters{ zone_redundancy: replication_factor}); + let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); ClusterLayout { version: 0, replication_factor, - zone_redundancy, partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), + parameters: default_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -132,6 +150,7 @@ impl ClusterLayout { true } Ordering::Equal => { + self.parameters.merge(&other.parameters); self.staging.merge(&other.staging); let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); @@ -145,7 +164,7 @@ impl ClusterLayout { } } - pub fn apply_staged_changes(mut self, version: Option) -> Result { + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self,Message), Error> { match version { None => { let error = r#" @@ -164,16 +183,14 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging); self.roles.retain(|(_, _, v)| v.0.is_some()); - if !self.calculate_partition_assignation() { - return Err(Error::Message("Could not calculate new assignation of partitions to nodes. 
This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); - } + let msg = self.calculate_partition_assignation()?; self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); self.version += 1; - Ok(self) + Ok((self,msg)) } pub fn revert_staged_changes(mut self, version: Option) -> Result { @@ -231,24 +248,24 @@ To know the correct value of the new layout version, invoke `garage layout show` } ///Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid : &Uuid) -> Result { + pub fn get_node_zone(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(role) => return Ok(role.zone.clone()), - _ => return Err("The Uuid does not correspond to a node present in the cluster.".to_string()) + _ => return Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) } } ///Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err("The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity.".to_string()) + _ => return Err(Error::Message("The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity.".into())) } } ///Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.useful_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -311,7 +328,8 @@ To know the correct value of the new layout version, invoke `garage layout show` 
let zones_of_p = nodes_of_p.iter() .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.")); - if zones_of_p.unique().count() < self.zone_redundancy { + let redundancy = self.parameters.get().zone_redundancy; + if zones_of_p.unique().count() < redundancy { return false; } } @@ -354,7 +372,7 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - pub fn calculate_partition_assignation(&mut self, replication:usize, redundancy:usize) -> Result { + pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. @@ -362,12 +380,12 @@ impl ClusterLayout { //We update the node ids, since the node list might have changed with the staged //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - self.replication_factor = replication; - self.zone_redundancy = redundancy; + let redundancy = self.parameters.get().zone_redundancy; + let mut msg = Message::new(); msg.push(format!("Computation of a new cluster layout where partitions are \ - replicated {} times on at least {} distinct zones.", replication, redundancy)); + replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); //We generate for once numerical ids for the zone, to use them as indices in the //flow graphs. @@ -381,6 +399,7 @@ impl ClusterLayout { //In this case, integer rounding plays a marginal role in the percentages of //optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + if old_assignation_opt != None { msg.push(format!("Given the replication and redundancy constraint, the \ optimal size of a partition is {}. 
In the previous layout, it used to \ @@ -392,6 +411,12 @@ impl ClusterLayout { } self.partition_size = partition_size; + if partition_size < 100 { + msg.push("WARNING: The partition size is low (< 100), you might consider to \ + give the nodes capacities in a smaller unit (e.g. Mb instead of Gb) to \ + achieve a more tailored use of your storage ressources.".into()); + } + //We compute a first flow/assignment that is heuristically close to the previous //assignment let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; @@ -413,7 +438,7 @@ impl ClusterLayout { /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,String> { + fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,Error> { // (1) We compute the new node list //Non gateway nodes should be coded on 8bits, hence they must be first in the list //We build the new node ids @@ -423,8 +448,8 @@ impl ClusterLayout { .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(format!("There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", MAX_NODE_NUMBER).to_string()); + return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ + layout. 
This is not allowed.", MAX_NODE_NUMBER).into() )); } let mut new_gateway_nodes: Vec = self.roles.items().iter() @@ -449,8 +474,8 @@ impl ClusterLayout { return Ok(None); } if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err("The old assignation does not have a size corresponding to \ - the old replication factor or the number of partitions.".to_string()); + return Err(Error::Message("The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions.".into())); } //We build a translation table between the uuid and new ids @@ -482,14 +507,14 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. - fn generate_zone_ids(&self) -> Result<(Vec, HashMap),String>{ + fn generate_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); for uuid in self.node_id_vec.iter() { if self.roles.get(uuid) == None { - return Err("The uuid was not found in the node roles (this should \ - not happen, it might be a critical error).".to_string()); + return Err(Error::Message("The uuid was not found in the node roles (this should \ + not happen, it might be a critical error).".into())); } match self.node_role(&uuid) { Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { @@ -504,14 +529,14 @@ impl ClusterLayout { ///This function computes by dichotomy the largest realizable partition size, given ///the layout. - fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ + fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ let nb_partitions = 1usize << PARTITION_BITS; let empty_set = HashSet::<(usize,usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? 
< (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err("The storage capacity of he cluster is to small. It is \ - impossible to store partitions of size 1.".to_string()); + return Err(Error::Message("The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1.".into())); } let mut s_down = 1; @@ -545,14 +570,15 @@ impl ClusterLayout { return vertices; } - fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, String> { + fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); let mut g= Graph::::new(&vertices); let nb_zones = zone_to_id.len(); + let redundancy = self.parameters.get().zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), self.zone_redundancy as u32)?; - g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - self.zone_redundancy) as u32)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , @@ -574,7 +600,7 @@ impl ClusterLayout { fn compute_candidate_assignment(&self, zone_to_id: &HashMap, - old_assoc_opt : &Option >>) -> Result, String > { + old_assoc_opt : &Option >>) -> Result, Error > { //We list the edges that are not used in the old association let mut exclude_edge = HashSet::<(usize,usize)>::new(); @@ -601,7 +627,7 @@ impl ClusterLayout { return Ok(g); } - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), String > { + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) 
-> Result<(), Error > { let mut cost = CostFunction::new(); for p in 0..NB_PARTITIONS { for n in old_assoc[p].iter() { @@ -616,7 +642,7 @@ impl ClusterLayout { return Ok(()); } - fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), String>{ + fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ self.ring_assignation_data = Vec::::new(); for p in 0..NB_PARTITIONS { for z in 0..nb_zones { @@ -631,8 +657,8 @@ impl ClusterLayout { } if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err("Critical Error : the association ring we produced does not \ - have the right size.".to_string()); + return Err(Error::Message("Critical Error : the association ring we produced does not \ + have the right size.".into())); } return Ok(()); } @@ -643,7 +669,7 @@ impl ClusterLayout { fn output_stat(&self , gflow : &Graph, old_assoc_opt : &Option< Vec> >, zone_to_id: &HashMap, - id_to_zone : &Vec) -> Result{ + id_to_zone : &Vec) -> Result{ let mut msg = Message::new(); let nb_partitions = 1usize << PARTITION_BITS; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 7eb25195..9e0bfa11 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -196,7 +196,6 @@ impl System { network_key: NetworkKey, background: Arc, replication_factor: usize, - zone_redundancy: usize, config: &Config, ) -> Result, Error> { let node_key = @@ -226,7 +225,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor, zone_redundancy) + ClusterLayout::new(replication_factor) } }; -- cgit v1.2.3 From a951b6c45273e59b98f974937aebb8ada8816ab8 Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 5 Oct 2022 16:04:19 +0200 Subject: Added a CLI command to update the parameters for the layout computation (for now, only the zone redundancy) --- src/garage/cli/layout.rs | 35 +++++++++++++++++++++++++++++++++-- src/garage/cli/structs.rs | 14 
+++++++++++++- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index a5b838e7..6b86e46d 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -14,8 +14,8 @@ pub async fn cli_layout_command_dispatch( rpc_host: NodeID, ) -> Result<(), Error> { match cmd { - LayoutOperation::Assign(configure_opt) => { - cmd_assign_role(system_rpc_endpoint, rpc_host, configure_opt).await + LayoutOperation::Assign(assign_opt) => { + cmd_assign_role(system_rpc_endpoint, rpc_host, assign_opt).await } LayoutOperation::Remove(remove_opt) => { cmd_remove_role(system_rpc_endpoint, rpc_host, remove_opt).await @@ -27,6 +27,9 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Revert(revert_opt) => { cmd_revert_layout(system_rpc_endpoint, rpc_host, revert_opt).await } + LayoutOperation::Config(config_opt) => { + cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await + } } } @@ -245,6 +248,34 @@ pub async fn cmd_revert_layout( Ok(()) } +pub async fn cmd_config_layout( + rpc_cli: &Endpoint, + rpc_host: NodeID, + config_opt: ConfigLayoutOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + match config_opt.redundancy { + None => (), + Some(r) => { + if r > layout.replication_factor { + println!("The zone redundancy must be smaller or equal to the \ + replication factor ({}).", layout.replication_factor); + } + else if r < 1 { + println!("The zone redundancy must be at least 1."); + } + else { + layout.parameters.update(LayoutParameters{ zone_redundancy: r }); + println!("The new zone redundancy has been staged."); + } + } + } + + send_layout(rpc_cli, rpc_host, layout).await?; + Ok(()) +} + // --- utility --- pub async fn fetch_layout( diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 06548e89..896379bb 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -86,6 +86,10 @@ pub enum LayoutOperation { /// Remove 
role from Garage cluster node #[structopt(name = "remove", version = garage_version())] Remove(RemoveRoleOpt), + + /// Configure parameters value for the layout computation + #[structopt(name = "config", version = garage_version())] + Config(ConfigLayoutOpt), /// Show roles currently assigned to nodes and changes staged for commit #[structopt(name = "show", version = garage_version())] @@ -100,6 +104,7 @@ pub enum LayoutOperation { Revert(RevertLayoutOpt), } + #[derive(StructOpt, Debug)] pub struct AssignRoleOpt { /// Node(s) to which to assign role (prefix of hexadecimal node id) @@ -110,7 +115,7 @@ pub struct AssignRoleOpt { #[structopt(short = "z", long = "zone")] pub(crate) zone: Option, - /// Capacity (in relative terms, use 1 to represent your smallest server) + /// Capacity (in relative terms) #[structopt(short = "c", long = "capacity")] pub(crate) capacity: Option, @@ -133,6 +138,13 @@ pub struct RemoveRoleOpt { pub(crate) node_id: String, } +#[derive(StructOpt, Debug)] +pub struct ConfigLayoutOpt { + /// Zone redundancy parameter + #[structopt(short = "r", long = "redundancy")] + pub(crate) redundancy: Option, +} + #[derive(StructOpt, Debug)] pub struct ApplyLayoutOpt { /// Version number of new configuration: this command will fail if -- cgit v1.2.3 From 9407df60cc00fc70c10f73bc4b600085789d5353 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 6 Oct 2022 12:54:51 +0200 Subject: Corrected two bugs: - self.node_id_vec was not properly updated when the previous ring was empty - ClusterLayout::merge was not considering changes in the layout parameters --- src/garage/cli/layout.rs | 6 +++++- src/rpc/layout.rs | 56 ++++++++++++++++++++++++++++++------------------ src/rpc/system.rs | 1 + 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 6b86e46d..9e5bdaea 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -188,6 +188,10 @@ pub async fn cmd_show_layout( println!("No nodes 
have a role in the new layout."); } println!(); + + println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); + println!("Zone redundancy: {}", layout.parameters.get().zone_redundancy); + println!(); // this will print the stats of what partitions // will move around when we apply @@ -267,7 +271,7 @@ pub async fn cmd_config_layout( } else { layout.parameters.update(LayoutParameters{ zone_redundancy: r }); - println!("The new zone redundancy has been staged."); + println!("The new zone redundancy has been saved ({}).", r); } } } diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 8d2b3e17..89c18c68 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -150,15 +150,17 @@ impl ClusterLayout { true } Ordering::Equal => { + let param_changed = self.parameters.get() != other.parameters.get(); self.parameters.merge(&other.parameters); self.staging.merge(&other.staging); + let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - let changed = new_staging_hash != self.staging_hash; + let stage_changed = new_staging_hash != self.staging_hash; self.staging_hash = new_staging_hash; - changed + stage_changed || param_changed } Ordering::Less => false, } @@ -352,7 +354,7 @@ To know the correct value of the new layout version, invoke `garage layout show` //Check that the partition size stored is the one computed by the asignation //algorithm. let cl2 = self.clone(); - let (_ , zone_to_id) = cl2.generate_zone_ids().expect("Critical Error"); + let (_ , zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); if partition_size != self.partition_size { return false; @@ -371,13 +373,14 @@ impl ClusterLayout { /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of - /// data to be moved. 
+ /// data to be moved. + /// Staged changes must be merged with nodes roles before calling this function. pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. - - //We update the node ids, since the node list might have changed with the staged + + //We update the node ids, since the node role list might have changed with the //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; @@ -387,12 +390,23 @@ impl ClusterLayout { msg.push(format!("Computation of a new cluster layout where partitions are \ replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); - //We generate for once numerical ids for the zone, to use them as indices in the - //flow graphs. - let (id_to_zone , zone_to_id) = self.generate_zone_ids()?; + //We generate for once numerical ids for the zones of non gateway nodes, + //to use them as indices in the flow graphs. + let (id_to_zone , zone_to_id) = self.generate_useful_zone_ids()?; + let nb_useful_nodes = self.useful_nodes().len(); msg.push(format!("The cluster contains {} nodes spread over {} zones.", - self.useful_nodes().len(), id_to_zone.len())); + nb_useful_nodes, id_to_zone.len())); + if nb_useful_nodes < self.replication_factor{ + return Err(Error::Message(format!("The number of nodes with positive \ + capacity ({}) is smaller than the replication factor ({}).", + nb_useful_nodes, self.replication_factor))); + } + if id_to_zone.len() < redundancy { + return Err(Error::Message(format!("The number of zones with non-gateway \ + nodes ({}) is smaller than the redundancy parameter ({})", + id_to_zone.len() , redundancy))); + } //We compute the optimal partition size //Capacities should be given in a unit so that partition size is at least 100. 
@@ -413,8 +427,7 @@ impl ClusterLayout { if partition_size < 100 { msg.push("WARNING: The partition size is low (< 100), you might consider to \ - give the nodes capacities in a smaller unit (e.g. Mb instead of Gb) to \ - achieve a more tailored use of your storage ressources.".into()); + provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb).".into()); } //We compute a first flow/assignment that is heuristically close to the previous @@ -456,12 +469,14 @@ impl ClusterLayout { .filter(|(_, _, v)| match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) .map(|(k, _, _)| *k).collect(); - + let nb_useful_nodes = new_non_gateway_nodes.len(); let mut new_node_id_vec = Vec::::new(); new_node_id_vec.append(&mut new_non_gateway_nodes); new_node_id_vec.append(&mut new_gateway_nodes); + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); // (2) We retrieve the old association //We rewrite the old association with the new indices. We only consider partition @@ -490,15 +505,14 @@ impl ClusterLayout { let rf= self.replication_factor; for p in 0..nb_partitions { for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { - let uuid = self.node_id_vec[*old_id as usize]; + let uuid = old_node_id_vec[*old_id as usize]; if uuid_to_new_id.contains_key(&uuid) { old_assignation[p].push(uuid_to_new_id[&uuid]); } } } - //We write the results - self.node_id_vec = new_node_id_vec; + //We write the ring self.ring_assignation_data = Vec::::new(); return Ok(Some(old_assignation)); @@ -507,11 +521,11 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. 
- fn generate_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ + fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - - for uuid in self.node_id_vec.iter() { + + for uuid in self.useful_nodes().iter() { if self.roles.get(uuid) == None { return Err(Error::Message("The uuid was not found in the node roles (this should \ not happen, it might be a critical error).".into())); @@ -685,7 +699,7 @@ impl ClusterLayout { storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ See the detailed statistics below and look for saturated nodes/zones.")); - msg.push(format!("Recall that because of the replication, the actual available \ + msg.push(format!("Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", used_cap , self.replication_factor , used_cap/self.replication_factor as u32)); @@ -741,7 +755,7 @@ impl ClusterLayout { transferred.", total_new_partitions)); } msg.push(format!("")); - msg.push(format!("Detailed statistics by zones and nodes.")); + msg.push(format!("==== DETAILED STATISTICS BY ZONES AND NODES ====")); for z in 0..id_to_zone.len(){ let mut nodes_of_z = Vec::::new(); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 9e0bfa11..655d21de 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,6 +565,7 @@ impl System { return Err(Error::Message(msg)); } + let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3 From 911eb17bd9e25f2f02fbe1de81a3384e99ea13ac Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 6 Oct 2022 14:53:57 +0200 Subject: corrected warnings of cargo clippy --- src/rpc/graph_algo.rs | 26 ++++++------ src/rpc/layout.rs | 111 ++++++++++++++++++++++++-------------------------- 2 files changed, 66 insertions(+), 71 deletions(-) diff --git a/src/rpc/graph_algo.rs 
b/src/rpc/graph_algo.rs index a5a1e4ba..4e27631a 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -59,10 +59,10 @@ pub type CostFunction = HashMap<(Vertex,Vertex), i32>; impl Graph{ pub fn new(vertices : &[Vertex]) -> Self { let mut map = HashMap::::new(); - for i in 0..vertices.len() { - map.insert(vertices[i] , i); + for (i, vert) in vertices.iter().enumerate(){ + map.insert(*vert , i); } - return Graph:: { + Graph:: { vertextoid : map, idtovertex: vertices.to_vec(), graph : vec![Vec::< E >::new(); vertices.len() ] @@ -99,7 +99,7 @@ impl Graph{ result.push(self.idtovertex[edge.dest]); } } - return Ok(result); + Ok(result) } @@ -113,7 +113,7 @@ impl Graph{ for edge in self.graph[idv].iter() { result += max(0,self.graph[edge.dest][edge.rev].flow); } - return Ok(result); + Ok(result) } //This function returns the value of the flow outgoing from v. @@ -126,13 +126,13 @@ impl Graph{ for edge in self.graph[idv].iter() { result += max(0,edge.flow); } - return Ok(result); + Ok(result) } //This function computes the flow total value by computing the outgoing flow //from the source. pub fn get_flow_value(&mut self) -> Result { - return self.get_outflow(Vertex::Source); + self.get_outflow(Vertex::Source) } //This function shuffles the order of the edge lists. It keeps the ids of the @@ -157,7 +157,7 @@ impl Graph{ for edge in self.graph[idsource].iter(){ flow_upper_bound += edge.cap; } - return flow_upper_bound; + flow_upper_bound } //This function computes the maximal flow using Dinic's algorithm. 
It starts with @@ -270,7 +270,7 @@ impl Graph{ //We build the weighted graph g where we will look for negative cycle let mut gf = self.build_cost_graph(cost)?; let mut cycles = gf.list_negative_cycles(path_length); - while cycles.len() > 0 { + while !cycles.is_empty() { //we enumerate negative cycles for c in cycles.iter(){ for i in 0..c.len(){ @@ -293,7 +293,7 @@ impl Graph{ gf = self.build_cost_graph(cost)?; cycles = gf.list_negative_cycles(path_length); } - return Ok(()); + Ok(()) } //Construct the weighted graph G_f from the flow and the cost function @@ -319,7 +319,7 @@ impl Graph{ } } } - return Ok(g); + Ok(g) } @@ -334,7 +334,7 @@ impl Graph{ } let idu = self.vertextoid[&u]; let idv = self.vertextoid[&v]; - self.graph[idu].push( WeightedEdge{w: w , dest: idv} ); + self.graph[idu].push( WeightedEdge{ w , dest: idv} ); Ok(()) } @@ -415,7 +415,7 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { cycles.push(cy); } } - return cycles; + cycles } diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 89c18c68..1969b721 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -56,7 +56,7 @@ pub struct ClusterLayout { } fn default_partition_size() -> u32{ - return 0; + 0 } fn default_layout_parameters() -> Lww{ @@ -107,15 +107,15 @@ impl NodeRole { pub fn tags_string(&self) -> String { let mut tags = String::new(); - if self.tags.len() == 0 { + if self.tags.is_empty() { return tags } tags.push_str(&self.tags[0].clone()); for t in 1..self.tags.len(){ - tags.push_str(","); + tags.push(','); tags.push_str(&self.tags[t].clone()); } - return tags; + tags } } @@ -246,22 +246,22 @@ To know the correct value of the new layout version, invoke `garage layout show` _ => () } } - return result; + result } ///Given a node uuids, this function returns the label of its zone pub fn get_node_zone(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { - Some(role) => return Ok(role.zone.clone()), - _ => return Err(Error::Message("The Uuid does not correspond to a node 
present in the cluster.".into())) + Some(role) => Ok(role.zone.clone()), + _ => Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) } } ///Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { match self.node_role(uuid) { - Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => return Ok(*cap), - _ => return Err(Error::Message("The Uuid does not correspond to a node present in the \ + Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => Ok(*cap), + _ => Err(Error::Message("The Uuid does not correspond to a node present in the \ cluster or this node does not have a positive capacity.".into())) } } @@ -272,7 +272,7 @@ To know the correct value of the new layout version, invoke `garage layout show` for uuid in self.useful_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; } - return Ok(total_capacity); + Ok(total_capacity) } @@ -341,10 +341,10 @@ To know the correct value of the new layout version, invoke `garage layout show` for n in self.ring_assignation_data.iter() { node_usage[*n as usize] += 1; } - for n in 0..MAX_NODE_NUMBER { - if node_usage[n] > 0 { + for (n, usage) in node_usage.iter().enumerate(){ + if *usage > 0 { let uuid = self.node_id_vec[n]; - if node_usage[n]*self.partition_size > self.get_node_capacity(&uuid) + if usage*self.partition_size > self.get_node_capacity(&uuid) .expect("Critical Error"){ return false; } @@ -435,7 +435,7 @@ impl ClusterLayout { let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { //We minimize the distance to the previous assignment. 
- self.minimize_rebalance_load(&mut gflow, &zone_to_id, &assoc)?; + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); @@ -443,7 +443,7 @@ impl ClusterLayout { //We update the layout structure self.update_ring_from_flow(id_to_zone.len() , &gflow)?; - return Ok(msg); + Ok(msg) } /// The LwwMap of node roles might have changed. This function updates the node_id_vec @@ -456,21 +456,18 @@ impl ClusterLayout { //Non gateway nodes should be coded on 8bits, hence they must be first in the list //We build the new node ids let mut new_non_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| - match &v.0 {Some(r) if r.capacity != None => true, _=> false }) + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) .map(|(k, _, _)| *k).collect(); if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", MAX_NODE_NUMBER).into() )); + layout. 
This is not allowed.", MAX_NODE_NUMBER) )); } let mut new_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| - match v {NodeRoleV(Some(r)) if r.capacity == None => true, _=> false }) + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) .map(|(k, _, _)| *k).collect(); - let nb_useful_nodes = new_non_gateway_nodes.len(); let mut new_node_id_vec = Vec::::new(); new_node_id_vec.append(&mut new_non_gateway_nodes); new_node_id_vec.append(&mut new_gateway_nodes); @@ -484,7 +481,7 @@ impl ClusterLayout { let nb_partitions = 1usize << PARTITION_BITS; let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; - if self.ring_assignation_data.len() == 0 { + if self.ring_assignation_data.is_empty() { //This is a new association return Ok(None); } @@ -498,16 +495,16 @@ impl ClusterLayout { //We add the indices of only the new non-gateway nodes that can be used in the //association ring - for i in 0..nb_useful_nodes { - uuid_to_new_id.insert(new_node_id_vec[i], i ); + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i ); } let rf= self.replication_factor; - for p in 0..nb_partitions { + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { let uuid = old_node_id_vec[*old_id as usize]; if uuid_to_new_id.contains_key(&uuid) { - old_assignation[p].push(uuid_to_new_id[&uuid]); + old_assign_p.push(uuid_to_new_id[&uuid]); } } } @@ -515,7 +512,7 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); - return Ok(Some(old_assignation)); + Ok(Some(old_assignation)) } @@ -530,15 +527,14 @@ impl ClusterLayout { return Err(Error::Message("The uuid was not found in the node roles (this should \ not happen, it might be a critical error).".into())); } - match self.node_role(&uuid) { - Some(r) => if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone() , id_to_zone.len()); - 
id_to_zone.push(r.zone.clone()); - } - _ => () + if let Some(r) = self.node_role(uuid) { + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone() , id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } } } - return Ok((id_to_zone, zone_to_id)); + Ok((id_to_zone, zone_to_id)) } ///This function computes by dichotomy the largest realizable partition size, given @@ -566,7 +562,7 @@ impl ClusterLayout { } } - return Ok(s_down); + Ok(s_down) } fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { @@ -581,7 +577,7 @@ impl ClusterLayout { for n in 0..nb_nodes { vertices.push(Vertex::N(n)); } - return vertices; + vertices } fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { @@ -609,7 +605,7 @@ impl ClusterLayout { } } } - return Ok(g); + Ok(g) } @@ -620,11 +616,11 @@ impl ClusterLayout { let mut exclude_edge = HashSet::<(usize,usize)>::new(); if let Some(old_assoc) = old_assoc_opt { let nb_nodes = self.useful_nodes().len(); - for p in 0..NB_PARTITIONS { + for (p, old_assoc_p) in old_assoc.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p,n)); } - for n in old_assoc[p].iter() { + for n in old_assoc_p.iter() { exclude_edge.remove(&(p,*n)); } } @@ -638,13 +634,13 @@ impl ClusterLayout { g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; - return Ok(g); + Ok(g) } - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &Vec< Vec >) -> Result<(), Error > { + fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &[Vec ]) -> Result<(), Error > { let mut cost = CostFunction::new(); - for p in 0..NB_PARTITIONS { - for n in old_assoc[p].iter() { + for (p, assoc_p) in old_assoc.iter().enumerate(){ + for n in assoc_p.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p,node_zone), 
Vertex::N(*n)), -1); } @@ -653,7 +649,7 @@ impl ClusterLayout { let path_length = 4*nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; - return Ok(()); + Ok(()) } fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ @@ -662,9 +658,8 @@ impl ClusterLayout { for z in 0..nb_zones { let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; for vertex in assoc_vertex.iter() { - match vertex{ - Vertex::N(n) => self.ring_assignation_data.push((*n).try_into().unwrap()), - _ => () + if let Vertex::N(n) = vertex { + self.ring_assignation_data.push((*n).try_into().unwrap()); } } } @@ -674,7 +669,7 @@ impl ClusterLayout { return Err(Error::Message("Critical Error : the association ring we produced does not \ have the right size.".into())); } - return Ok(()); + Ok(()) } @@ -683,7 +678,7 @@ impl ClusterLayout { fn output_stat(&self , gflow : &Graph, old_assoc_opt : &Option< Vec> >, zone_to_id: &HashMap, - id_to_zone : &Vec) -> Result{ + id_to_zone : &[String]) -> Result{ let mut msg = Message::new(); let nb_partitions = 1usize << PARTITION_BITS; @@ -693,12 +688,12 @@ impl ClusterLayout { let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", used_cap , total_cap , percent_cap )); - msg.push(format!("")); - msg.push(format!("If the percentage is to low, it might be that the \ + msg.push("".into()); + msg.push("If the percentage is to low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. 
\ - See the detailed statistics below and look for saturated nodes/zones.")); + See the detailed statistics below and look for saturated nodes/zones.".into()); msg.push(format!("Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", used_cap , self.replication_factor , @@ -715,7 +710,7 @@ impl ClusterLayout { for p in 0..nb_partitions { for z in 0..id_to_zone.len() { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - if pz_nodes.len() > 0 { + if !pz_nodes.is_empty() { stored_partitions_zone[z] += 1; if let Some(old_assoc) = old_assoc_opt { let mut old_zones_of_p = Vec::::new(); @@ -748,14 +743,14 @@ impl ClusterLayout { //We display the statistics - msg.push(format!("")); + msg.push("".into()); if *old_assoc_opt != None { let total_new_partitions : usize = new_partitions.iter().sum(); msg.push(format!("A total of {} new copies of partitions need to be \ transferred.", total_new_partitions)); } - msg.push(format!("")); - msg.push(format!("==== DETAILED STATISTICS BY ZONES AND NODES ====")); + msg.push("".into()); + msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); for z in 0..id_to_zone.len(){ let mut nodes_of_z = Vec::::new(); @@ -766,7 +761,7 @@ impl ClusterLayout { } let replicated_partitions : usize = nodes_of_z.iter() .map(|n| stored_partitions[*n]).sum(); - msg.push(format!("")); + msg.push("".into()); msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], @@ -796,7 +791,7 @@ impl ClusterLayout { } } - return Ok(msg); + Ok(msg) } } -- cgit v1.2.3 From fcf9ac674a2842b2b55d933e60af5af93dcc4592 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:19:25 +0200 Subject: Tests written in layout.rs added staged_parameters to ClusterLayout removed the serde(default) -> will need a migration function --- src/db/lib.rs | 2 +- src/garage/cli/layout.rs | 4 +- src/rpc/graph_algo.rs | 14 --- 
src/rpc/layout.rs | 232 +++++++++++++++++++++-------------------------- 4 files changed, 107 insertions(+), 145 deletions(-) diff --git a/src/db/lib.rs b/src/db/lib.rs index d96586be..af539494 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -3,7 +3,7 @@ extern crate tracing; #[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] -compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); +//compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); #[cfg(feature = "lmdb")] pub mod lmdb_adapter; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 9e5bdaea..32f637eb 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -190,7 +190,7 @@ pub async fn cmd_show_layout( println!(); println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); - println!("Zone redundancy: {}", layout.parameters.get().zone_redundancy); + println!("Zone redundancy: {}", layout.staged_parameters.get().zone_redundancy); println!(); // this will print the stats of what partitions @@ -270,7 +270,7 @@ pub async fn cmd_config_layout( println!("The zone redundancy must be at least 1."); } else { - layout.parameters.update(LayoutParameters{ zone_redundancy: r }); + layout.staged_parameters.update(LayoutParameters{ zone_redundancy: r }); println!("The new zone redundancy has been saved ({}).", r); } } diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 4e27631a..70ccf35a 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -419,17 +419,3 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { } -//==================================================================================== -//==================================================================================== -//==================================================================================== -//==================================================================================== 
-//==================================================================================== -//==================================================================================== - - -#[cfg(test)] -mod tests { - use super::*; - -} - diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 1969b721..976f94af 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -30,8 +30,8 @@ pub struct ClusterLayout { //This attribute is only used to retain the previously computed partition size, //to know to what extent does it change with the layout update. - #[serde(default="default_partition_size")] pub partition_size: u32, + pub parameters: LayoutParameters, pub roles: LwwMap, @@ -49,20 +49,11 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout - #[serde(default="default_layout_parameters")] - pub parameters: Lww, + pub staged_parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } -fn default_partition_size() -> u32{ - 0 -} - -fn default_layout_parameters() -> Lww{ - Lww::::new(LayoutParameters{ zone_redundancy: 1}) -} - ///This struct is used to set the parameters to be used in the assignation computation ///algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] @@ -124,8 +115,8 @@ impl ClusterLayout { //We set the default zone redundancy to be equal to the replication factor, //i.e. as strict as possible. 
- let default_parameters = Lww::::new( - LayoutParameters{ zone_redundancy: replication_factor}); + let parameters = LayoutParameters{ zone_redundancy: replication_factor}; + let staged_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); @@ -137,7 +128,8 @@ impl ClusterLayout { roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), - parameters: default_parameters, + parameters, + staged_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -150,8 +142,8 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.parameters.get() != other.parameters.get(); - self.parameters.merge(&other.parameters); + let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); + self.staged_parameters.merge(&other.staged_parameters); self.staging.merge(&other.staging); @@ -330,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let zones_of_p = nodes_of_p.iter() .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.")); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.parameters.zone_redundancy; if zones_of_p.unique().count() < redundancy { return false; } @@ -384,7 +376,8 @@ impl ClusterLayout { //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; + let mut msg = Message::new(); msg.push(format!("Computation of a new cluster layout where partitions are \ @@ -417,13 +410,15 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!("Given the replication and redundancy constraint, the \ optimal size of a partition is {}. 
In the previous layout, it used to \ - be {}.", partition_size, self.partition_size)); + be {} (the zone redundancy was {}).", partition_size, self.partition_size, + self.parameters.zone_redundancy)); } else { msg.push(format!("Given the replication and redundancy constraints, the \ optimal size of a partition is {}.", partition_size)); } self.partition_size = partition_size; + self.parameters = self.staged_parameters.get().clone(); if partition_size < 100 { msg.push("WARNING: The partition size is low (< 100), you might consider to \ @@ -511,6 +506,10 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); + + if !self.check() { + return Err(Error::Message("Critical error: The computed layout happens to be incorrect".into())); + } Ok(Some(old_assignation)) } @@ -585,7 +584,7 @@ impl ClusterLayout { self.useful_nodes().len()); let mut g= Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.get().zone_redundancy; + let redundancy = self.staged_parameters.get().zone_redundancy; for p in 0..NB_PARTITIONS { g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; @@ -800,96 +799,80 @@ impl ClusterLayout { #[cfg(test)] mod tests { - use super::*; - use std::io::*; -// use itertools::Itertools; -/* - fn check_assignation(cl: &ClusterLayout) { - //Check that input data has the right format - let nb_partitions = 1usize << PARTITION_BITS; - assert!(cl.ring_assignation_data.len() == nb_partitions * cl.replication_factor); - - //Check that is is a correct assignation with zone redundancy - let rf = cl.replication_factor; - for i in 0..nb_partitions { - assert!( - rf == cl.ring_assignation_data[rf * i..rf * (i + 1)] - .iter() - .map(|nod| node_zone[*nod as usize].clone()) - .unique() - .count() - ); - } + use super::{*,Error}; + use std::cmp::min; + + + //This function checks that the partition size S 
computed is at least better than the + //one given by a very naive algorithm. To do so, we try to run the naive algorithm + //assuming a partion size of S+1. If we succed, it means that the optimal assignation + //was not optimal. The naive algorithm is the following : + //- we compute the max number of partitions associated to every node, capped at the + //partition number. It gives the number of tokens of every node. + //- every zone has a number of tokens equal to the sum of the tokens of its nodes. + //- we cycle over the partitions and associate zone tokens while respecting the + //zone redundancy constraint. + //NOTE: the naive algorithm is not optimal. Counter example: + //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + //With these parameters, the naive algo fails, whereas there is a solution: + //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &ClusterLayout) -> Result { + let over_size = cl.partition_size +1; + let mut zone_token = HashMap::::new(); + let nb_partitions = 1usize << PARTITION_BITS; + + let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } - let nb_nodes = cl.node_id_vec.len(); - //Check optimality - let node_nb_part = (0..nb_nodes) - .map(|i| { - cl.ring_assignation_data - .iter() - .filter(|x| **x == i as u8) - .count() - }) - .collect::>(); + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.useful_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert(z.clone(), zone_token[&z] + min(nb_partitions , (c/over_size) as usize)); + } + + //For every partition, we count the number of zone already associated and + //the name of the last zone associated - let zone_vec = node_zone.iter().unique().collect::>(); - let zone_nb_part = zone_vec - .iter() - .map(|z| { - cl.ring_assignation_data - .iter() - 
.filter(|x| node_zone[**x as usize] == **z) - .count() - }) - .collect::>(); + let mut id_zone_token = vec![0; zones.len()]; + for (z,t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } - //Check optimality of the zone assignation : would it be better for the - //node_capacity/node_partitions ratio to change the assignation of a partition - - if let Some(idmin) = (0..nb_nodes).min_by(|i, j| { - (node_capacity[*i] * node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) - }) { - if let Some(idnew) = (0..nb_nodes) - .filter(|i| { - if let Some(p) = zone_vec.iter().position(|z| **z == node_zone[*i]) { - zone_nb_part[p] < nb_partitions - } else { - false - } - }) - .max_by(|i, j| { - (node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) - .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) - }) { - assert!( - node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) - >= node_capacity[idnew] * node_nb_part[idmin] as u32 - ); - } - } + let mut nb_token = vec![0; nb_partitions]; + let mut last_zone = vec![zones.len(); nb_partitions]; + + let mut curr_zone = 0; + + let redundancy = cl.parameters.zone_redundancy; + + for replic in 0..cl.replication_factor { + for p in 0..nb_partitions { + while id_zone_token[curr_zone] == 0 || + (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } - //In every zone, check optimality of the nod assignation - for z in zone_vec { - let node_of_z_iter = (0..nb_nodes).filter(|id| node_zone[*id] == *z); - if let Some(idmin) = node_of_z_iter.clone().min_by(|i, j| { - (node_capacity[*i] * node_nb_part[*j] as u32) - .cmp(&(node_capacity[*j] * node_nb_part[*i] as u32)) - }) { - if let Some(idnew) = node_of_z_iter.min_by(|i, j| { - 
(node_capacity[*i] * (node_nb_part[*j] as u32 + 1)) - .cmp(&(node_capacity[*j] * (node_nb_part[*i] as u32 + 1))) - }) { - assert!( - node_capacity[idmin] * (node_nb_part[idnew] as u32 + 1) - >= node_capacity[idnew] * node_nb_part[idmin] as u32 - ); - } - } - } - } -*/ - fn show_msg(msg : &Message) { for s in msg.iter(){ println!("{}",s); @@ -901,6 +884,7 @@ mod tests { node_id_vec: &Vec, node_capacity_vec: &Vec, node_zone_vec: &Vec, + zone_redundancy: usize ) { for i in 0..node_id_vec.len() { if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { @@ -917,11 +901,11 @@ mod tests { ); cl.roles.merge(&update); } + cl.staged_parameters = Lww::::new(LayoutParameters{zone_redundancy}); } #[test] fn test_assignation() { - std::io::stdout().flush().ok().expect("Could not flush stdout"); let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] @@ -929,22 +913,11 @@ mod tests { .map(|x| x.to_string()) .collect(); - let mut cl = ClusterLayout { - node_id_vec: vec![], - - roles: LwwMap::new(), - - replication_factor: 3, - zone_redundancy: 1, - partition_size: 0, - ring_assignation_data: vec![], - version: 0, - staging: LwwMap::new(), - staging_hash: blake2sum(&rmp_to_vec_all_named(&LwwMap::::new()).unwrap()[..]), - }; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + let mut cl = ClusterLayout::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; @@ -952,19 +925,22 @@ mod tests { .into_iter() .map(|x| x.to_string()) .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - 
show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,3).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec); - show_msg(&cl.calculate_partition_assignation(3,1).unwrap()); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); + assert!(matches!(check_against_naive(&cl), Ok(true))); } } -- cgit v1.2.3 From 4abab246f1113a9a1988fdfca81c1dd8ffa323c8 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:21:13 +0200 Subject: cargo fmt --- src/api/admin/cluster.rs | 8 +- src/db/lib.rs | 1 - src/garage/cli/layout.rs | 94 ++-- src/garage/cli/structs.rs | 7 +- src/rpc/graph_algo.rs | 754 ++++++++++++------------- src/rpc/layout.rs | 1332 ++++++++++++++++++++++++--------------------- src/rpc/lib.rs | 3 +- src/rpc/system.rs | 1 - 8 files changed, 1162 insertions(+), 1038 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 630179b5..da3d8c44 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -163,10 +163,10 @@ pub async fn handle_apply_cluster_layout( let layout = garage.system.get_cluster_layout(); let (layout, msg) = 
layout.apply_staged_changes(Some(param.version))?; - //TODO : how to display msg ? Should it be in the Body Response ? - for s in msg.iter() { - println!("{}", s); - } + //TODO : how to display msg ? Should it be in the Body Response ? + for s in msg.iter() { + println!("{}", s); + } garage.system.update_cluster_layout(&layout).await?; diff --git a/src/db/lib.rs b/src/db/lib.rs index af539494..0a776a91 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -4,7 +4,6 @@ extern crate tracing; #[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] //compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); - #[cfg(feature = "lmdb")] pub mod lmdb_adapter; #[cfg(feature = "sled")] diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 32f637eb..f747fbe4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -27,9 +27,9 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Revert(revert_opt) => { cmd_revert_layout(system_rpc_endpoint, rpc_host, revert_opt).await } - LayoutOperation::Config(config_opt) => { - cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await - } + LayoutOperation::Config(config_opt) => { + cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await + } } } @@ -188,30 +188,37 @@ pub async fn cmd_show_layout( println!("No nodes have a role in the new layout."); } println!(); - + println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); - println!("Zone redundancy: {}", layout.staged_parameters.get().zone_redundancy); + println!( + "Zone redundancy: {}", + layout.staged_parameters.get().zone_redundancy + ); println!(); // this will print the stats of what partitions // will move around when we apply - match layout.calculate_partition_assignation() { - Ok(msg) => { - for line in msg.iter() { - println!("{}", line); - } - println!("To enact the staged role changes, type:"); - println!(); - println!(" garage layout apply --version {}", 
layout.version + 1); - println!(); - println!( + match layout.calculate_partition_assignation() { + Ok(msg) => { + for line in msg.iter() { + println!("{}", line); + } + println!("To enact the staged role changes, type:"); + println!(); + println!(" garage layout apply --version {}", layout.version + 1); + println!(); + println!( "You can also revert all proposed changes with: garage layout revert --version {}", - layout.version + 1)}, - Err(Error::Message(s)) => { - println!("Error while trying to compute the assignation: {}", s); - println!("This new layout cannot yet be applied.");}, - _ => { println!("Unknown Error"); }, - } + layout.version + 1) + } + Err(Error::Message(s)) => { + println!("Error while trying to compute the assignation: {}", s); + println!("This new layout cannot yet be applied."); + } + _ => { + println!("Unknown Error"); + } + } } Ok(()) @@ -225,9 +232,9 @@ pub async fn cmd_apply_layout( let layout = fetch_layout(rpc_cli, rpc_host).await?; let (layout, msg) = layout.apply_staged_changes(apply_opt.version)?; - for line in msg.iter() { - println!("{}", line); - } + for line in msg.iter() { + println!("{}", line); + } send_layout(rpc_cli, rpc_host, layout).await?; @@ -258,26 +265,29 @@ pub async fn cmd_config_layout( config_opt: ConfigLayoutOpt, ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - - match config_opt.redundancy { - None => (), - Some(r) => { - if r > layout.replication_factor { - println!("The zone redundancy must be smaller or equal to the \ - replication factor ({}).", layout.replication_factor); - } - else if r < 1 { - println!("The zone redundancy must be at least 1."); - } - else { - layout.staged_parameters.update(LayoutParameters{ zone_redundancy: r }); - println!("The new zone redundancy has been saved ({}).", r); - } - } - } + + match config_opt.redundancy { + None => (), + Some(r) => { + if r > layout.replication_factor { + println!( + "The zone redundancy must be smaller or equal to the 
\ + replication factor ({}).", + layout.replication_factor + ); + } else if r < 1 { + println!("The zone redundancy must be at least 1."); + } else { + layout + .staged_parameters + .update(LayoutParameters { zone_redundancy: r }); + println!("The new zone redundancy has been saved ({}).", r); + } + } + } send_layout(rpc_cli, rpc_host, layout).await?; - Ok(()) + Ok(()) } // --- utility --- diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 896379bb..02ed8992 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -86,10 +86,10 @@ pub enum LayoutOperation { /// Remove role from Garage cluster node #[structopt(name = "remove", version = garage_version())] Remove(RemoveRoleOpt), - - /// Configure parameters value for the layout computation + + /// Configure parameters value for the layout computation #[structopt(name = "config", version = garage_version())] - Config(ConfigLayoutOpt), + Config(ConfigLayoutOpt), /// Show roles currently assigned to nodes and changes staged for commit #[structopt(name = "show", version = garage_version())] @@ -104,7 +104,6 @@ pub enum LayoutOperation { Revert(RevertLayoutOpt), } - #[derive(StructOpt, Debug)] pub struct AssignRoleOpt { /// Node(s) to which to assign role (prefix of hexadecimal node id) diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 70ccf35a..13c60692 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -1,42 +1,40 @@ - //! This module deals with graph algorithms. //! It is used in layout.rs to build the partition to node assignation. use rand::prelude::SliceRandom; use std::cmp::{max, min}; -use std::collections::VecDeque; use std::collections::HashMap; +use std::collections::VecDeque; //Vertex data structures used in all the graphs used in layout.rs. //usize parameters correspond to node/zone/partitions ids. //To understand the vertex roles below, please refer to the formal description //of the layout computation algorithm. 
-#[derive(Clone,Copy,Debug, PartialEq, Eq, Hash)] -pub enum Vertex{ - Source, - Pup(usize), //The vertex p+ of partition p - Pdown(usize), //The vertex p- of partition p - PZ(usize,usize), //The vertex corresponding to x_(partition p, zone z) - N(usize), //The vertex corresponding to node n - Sink +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Vertex { + Source, + Pup(usize), //The vertex p+ of partition p + Pdown(usize), //The vertex p- of partition p + PZ(usize, usize), //The vertex corresponding to x_(partition p, zone z) + N(usize), //The vertex corresponding to node n + Sink, } - //Edge data structure for the flow algorithm. //The graph is stored as an adjacency list #[derive(Clone, Copy, Debug)] pub struct FlowEdge { - cap: u32, //flow maximal capacity of the edge - flow: i32, //flow value on the edge - dest: usize, //destination vertex id - rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v + cap: u32, //flow maximal capacity of the edge + flow: i32, //flow value on the edge + dest: usize, //destination vertex id + rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v } //Edge data structure for the detection of negative cycles. //The graph is stored as a list of edges (u,v). #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { - w: i32, //weight of the edge + w: i32, //weight of the edge dest: usize, } @@ -47,375 +45,377 @@ impl Edge for WeightedEdge {} //Struct for the graph structure. We do encapsulation here to be able to both //provide user friendly Vertex enum to address vertices, and to use usize indices //and Vec instead of HashMap in the graph algorithm to optimize execution speed. 
-pub struct Graph{ - vertextoid : HashMap, - idtovertex : Vec, - - graph : Vec< Vec > -} +pub struct Graph { + vertextoid: HashMap, + idtovertex: Vec, -pub type CostFunction = HashMap<(Vertex,Vertex), i32>; - -impl Graph{ - pub fn new(vertices : &[Vertex]) -> Self { - let mut map = HashMap::::new(); - for (i, vert) in vertices.iter().enumerate(){ - map.insert(*vert , i); - } - Graph:: { - vertextoid : map, - idtovertex: vertices.to_vec(), - graph : vec![Vec::< E >::new(); vertices.len() ] - } - } + graph: Vec>, } -impl Graph{ - //This function adds a directed edge to the graph with capacity c, and the - //corresponding reversed edge with capacity 0. - pub fn add_edge(&mut self, u: Vertex, v:Vertex, c: u32) -> Result<(), String>{ - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; - let rev_u = self.graph[idu].len(); - let rev_v = self.graph[idv].len(); - self.graph[idu].push( FlowEdge{cap: c , dest: idv , flow: 0, rev : rev_v} ); - self.graph[idv].push( FlowEdge{cap: 0 , dest: idu , flow: 0, rev : rev_u} ); - Ok(()) - } - - //This function returns the list of vertices that receive a positive flow from - //vertex v. - pub fn get_positive_flow_from(&self , v:Vertex) -> Result< Vec , String>{ - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; - let mut result = Vec::::new(); - for edge in self.graph[idv].iter() { - if edge.flow > 0 { - result.push(self.idtovertex[edge.dest]); - } - } - Ok(result) - } - - - //This function returns the value of the flow incoming to v. 
- pub fn get_inflow(&self , v:Vertex) -> Result< i32 , String>{ - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0,self.graph[edge.dest][edge.rev].flow); - } - Ok(result) - } - - //This function returns the value of the flow outgoing from v. - pub fn get_outflow(&self , v:Vertex) -> Result< i32 , String>{ - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0,edge.flow); - } - Ok(result) - } - - //This function computes the flow total value by computing the outgoing flow - //from the source. - pub fn get_flow_value(&mut self) -> Result { - self.get_outflow(Vertex::Source) - } - - //This function shuffles the order of the edge lists. It keeps the ids of the - //reversed edges consistent. - fn shuffle_edges(&mut self) { - let mut rng = rand::thread_rng(); - for i in 0..self.graph.len() { - self.graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. - for j in 0..self.graph[i].len() { - let target_v = self.graph[i][j].dest; - let target_rev = self.graph[i][j].rev; - self.graph[target_v][target_rev].rev = j; - } - } - } - - //Computes an upper bound of the flow n the graph - pub fn flow_upper_bound(&self) -> u32{ - let idsource = self.vertextoid[&Vertex::Source]; - let mut flow_upper_bound = 0; - for edge in self.graph[idsource].iter(){ - flow_upper_bound += edge.cap; - } - flow_upper_bound - } - - //This function computes the maximal flow using Dinic's algorithm. It starts with - //the flow values already present in the graph. So it is possible to add some edge to - //the graph, compute a flow, add other edges, update the flow. 
- pub fn compute_maximal_flow(&mut self) -> Result<(), String> { - if !self.vertextoid.contains_key(&Vertex::Source) { - return Err("The graph does not contain a source.".to_string()); - } - if !self.vertextoid.contains_key(&Vertex::Sink) { - return Err("The graph does not contain a sink.".to_string()); - } - - let idsource = self.vertextoid[&Vertex::Source]; - let idsink = self.vertextoid[&Vertex::Sink]; - - let nb_vertices = self.graph.len(); - - let flow_upper_bound = self.flow_upper_bound(); - - //To ensure the dispersion of the associations generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //the vertices do not consider their neighbours in the same order. - self.shuffle_edges(); - - //We run Dinic's max flow algorithm - loop { - //We build the level array from Dinic's algorithm. - let mut level = vec![None; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((idsource, 0)); - while !fifo.is_empty() { - if let Some((id, lvl)) = fifo.pop_front() { - if level[id] == None { //it means id has not yet been reached - level[id] = Some(lvl); - for edge in self.graph[id].iter() { - if edge.cap as i32 - edge.flow > 0 { - fifo.push_back((edge.dest, lvl + 1)); - } - } - } - } - } - if level[idsink] == None { - //There is no residual flow - break; - } - //Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); - - lifo.push_back((idsource, flow_upper_bound)); - - while let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; - if id == idsink { - //The DFS reached the sink, we can add a - //residual flow. 
- lifo.pop_back(); - while let Some((id, _)) = lifo.pop_back() { - let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i32; - let id_rev = self.graph[id][nbd].dest; - let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i32; - } - lifo.push_back((idsource, flow_upper_bound)); - continue; - } - //else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= self.graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { - next_nbd[*parent] += 1; - } - continue; - } - //else we can try to send flow from id to its nbd - let new_flow = min(f as i32, self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow) as u32; - if new_flow == 0 { - next_nbd[id] += 1; - continue; - } - if let (Some(lvldest), Some(lvlid)) = - (level[self.graph[id][nbd].dest], level[id]){ - if lvldest <= lvlid { - //We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - } - //otherwise, we send flow to nbd. - lifo.push_back((self.graph[id][nbd].dest, new_flow)); - } - } - Ok(()) - } - - //This function takes a flow, and a cost function on the edges, and tries to find an - // equivalent flow with a better cost, by finding improving overflow cycles. It uses - // as subroutine the Bellman Ford algorithm run up to path_length. - // We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only - // one needs to be present in the cost function. 
- pub fn optimize_flow_with_cost(&mut self , cost: &CostFunction, path_length: usize ) - -> Result<(),String>{ - //We build the weighted graph g where we will look for negative cycle - let mut gf = self.build_cost_graph(cost)?; - let mut cycles = gf.list_negative_cycles(path_length); - while !cycles.is_empty() { - //we enumerate negative cycles - for c in cycles.iter(){ - for i in 0..c.len(){ - //We add one flow unit to the edge (u,v) of cycle c - let idu = self.vertextoid[&c[i]]; - let idv = self.vertextoid[&c[(i+1)%c.len()]]; - for j in 0..self.graph[idu].len(){ - //since idu appears at most once in the cycles, we enumerate every - //edge at most once. - let edge = self.graph[idu][j]; - if edge.dest == idv { - self.graph[idu][j].flow += 1; - self.graph[idv][edge.rev].flow -=1; - break; - } - } - } - } - - gf = self.build_cost_graph(cost)?; - cycles = gf.list_negative_cycles(path_length); - } - Ok(()) - } - - //Construct the weighted graph G_f from the flow and the cost function - fn build_cost_graph(&self , cost: &CostFunction) -> Result,String>{ - - let mut g = Graph::::new(&self.idtovertex); - let nb_vertices = self.idtovertex.len(); - for i in 0..nb_vertices { - for edge in self.graph[i].iter() { - if edge.cap as i32 -edge.flow > 0 { - //It is possible to send overflow through this edge - let u = self.idtovertex[i]; - let v = self.idtovertex[edge.dest]; - if cost.contains_key(&(u,v)) { - g.add_edge(u,v, cost[&(u,v)])?; - } - else if cost.contains_key(&(v,u)) { - g.add_edge(u,v, -cost[&(v,u)])?; - } - else{ - g.add_edge(u,v, 0)?; - } - } - } - } - Ok(g) - - } - - +pub type CostFunction = HashMap<(Vertex, Vertex), i32>; + +impl Graph { + pub fn new(vertices: &[Vertex]) -> Self { + let mut map = HashMap::::new(); + for (i, vert) in vertices.iter().enumerate() { + map.insert(*vert, i); + } + Graph:: { + vertextoid: map, + idtovertex: vertices.to_vec(), + graph: vec![Vec::::new(); vertices.len()], + } + } } -impl Graph{ - //This function adds a single directed 
weighted edge to the graph. - pub fn add_edge(&mut self, u: Vertex, v:Vertex, w: i32) -> Result<(), String>{ - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; - self.graph[idu].push( WeightedEdge{ w , dest: idv} ); - Ok(()) - } - - //This function lists the negative cycles it manages to find after path_length - //iterations of the main loop of the Bellman-Ford algorithm. For the classical - //algorithm, path_length needs to be equal to the number of vertices. However, - //for particular graph structures like our case, the algorithm is still correct - //when path_length is the length of the longest possible simple path. - //See the formal description of the algorithm for more details. - fn list_negative_cycles(&self, path_length: usize) -> Vec< Vec > { - - let nb_vertices = self.graph.len(); - - //We start with every vertex at distance 0 of some imaginary extra -1 vertex. - let mut distance = vec![0 ; nb_vertices]; - //The prev vector collects for every vertex from where does the shortest path come - let mut prev = vec![None; nb_vertices]; - - for _ in 0..path_length +1 { - for id in 0..nb_vertices{ - for e in self.graph[id].iter(){ - if distance[id] + e.w < distance[e.dest] { - distance[e.dest] = distance[id] + e.w; - prev[e.dest] = Some(id); - } - } - } - } - - - //If self.graph contains a negative cycle, then at this point the graph described - //by prev (which is a directed 1-forest/functional graph) - //must contain a cycle. We list the cycles of prev. - let cycles_prev = cycles_of_1_forest(&prev); - - //Remark that the cycle in prev is in the reverse order compared to the cycle - //in the graph. Thus the .rev(). 
- return cycles_prev.iter().map(|cycle| cycle.iter().rev().map( - |id| self.idtovertex[*id] - ).collect() ).collect(); - } - +impl Graph { + //This function adds a directed edge to the graph with capacity c, and the + //corresponding reversed edge with capacity 0. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { + if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idu = self.vertextoid[&u]; + let idv = self.vertextoid[&v]; + let rev_u = self.graph[idu].len(); + let rev_v = self.graph[idv].len(); + self.graph[idu].push(FlowEdge { + cap: c, + dest: idv, + flow: 0, + rev: rev_v, + }); + self.graph[idv].push(FlowEdge { + cap: 0, + dest: idu, + flow: 0, + rev: rev_u, + }); + Ok(()) + } + + //This function returns the list of vertices that receive a positive flow from + //vertex v. + pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = Vec::::new(); + for edge in self.graph[idv].iter() { + if edge.flow > 0 { + result.push(self.idtovertex[edge.dest]); + } + } + Ok(result) + } + + //This function returns the value of the flow incoming to v. + pub fn get_inflow(&self, v: Vertex) -> Result { + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0, self.graph[edge.dest][edge.rev].flow); + } + Ok(result) + } + + //This function returns the value of the flow outgoing from v. 
+ pub fn get_outflow(&self, v: Vertex) -> Result { + if !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idv = self.vertextoid[&v]; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0, edge.flow); + } + Ok(result) + } + + //This function computes the flow total value by computing the outgoing flow + //from the source. + pub fn get_flow_value(&mut self) -> Result { + self.get_outflow(Vertex::Source) + } + + //This function shuffles the order of the edge lists. It keeps the ids of the + //reversed edges consistent. + fn shuffle_edges(&mut self) { + let mut rng = rand::thread_rng(); + for i in 0..self.graph.len() { + self.graph[i].shuffle(&mut rng); + //We need to update the ids of the reverse edges. + for j in 0..self.graph[i].len() { + let target_v = self.graph[i][j].dest; + let target_rev = self.graph[i][j].rev; + self.graph[target_v][target_rev].rev = j; + } + } + } + + //Computes an upper bound of the flow n the graph + pub fn flow_upper_bound(&self) -> u32 { + let idsource = self.vertextoid[&Vertex::Source]; + let mut flow_upper_bound = 0; + for edge in self.graph[idsource].iter() { + flow_upper_bound += edge.cap; + } + flow_upper_bound + } + + //This function computes the maximal flow using Dinic's algorithm. It starts with + //the flow values already present in the graph. So it is possible to add some edge to + //the graph, compute a flow, add other edges, update the flow. 
+ pub fn compute_maximal_flow(&mut self) -> Result<(), String> { + if !self.vertextoid.contains_key(&Vertex::Source) { + return Err("The graph does not contain a source.".to_string()); + } + if !self.vertextoid.contains_key(&Vertex::Sink) { + return Err("The graph does not contain a sink.".to_string()); + } + + let idsource = self.vertextoid[&Vertex::Source]; + let idsink = self.vertextoid[&Vertex::Sink]; + + let nb_vertices = self.graph.len(); + + let flow_upper_bound = self.flow_upper_bound(); + + //To ensure the dispersion of the associations generated by the + //assignation, we shuffle the neighbours of the nodes. Hence, + //the vertices do not consider their neighbours in the same order. + self.shuffle_edges(); + + //We run Dinic's max flow algorithm + loop { + //We build the level array from Dinic's algorithm. + let mut level = vec![None; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((idsource, 0)); + while !fifo.is_empty() { + if let Some((id, lvl)) = fifo.pop_front() { + if level[id] == None { + //it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i32 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); + } + } + } + } + } + if level[idsink] == None { + //There is no residual flow + break; + } + //Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = VecDeque::new(); + + lifo.push_back((idsource, flow_upper_bound)); + + while let Some((id_tmp, f_tmp)) = lifo.back() { + let id = *id_tmp; + let f = *f_tmp; + if id == idsink { + //The DFS reached the sink, we can add a + //residual flow. 
+ lifo.pop_back(); + while let Some((id, _)) = lifo.pop_back() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i32; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i32; + } + lifo.push_back((idsource, flow_upper_bound)); + continue; + } + //else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= self.graph[id].len() { + //There is nothing to explore from id anymore + lifo.pop_back(); + if let Some((parent, _)) = lifo.back() { + next_nbd[*parent] += 1; + } + continue; + } + //else we can try to send flow from id to its nbd + let new_flow = min( + f as i32, + self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow, + ) as u32; + if new_flow == 0 { + next_nbd[id] += 1; + continue; + } + if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { + if lvldest <= lvlid { + //We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + } + //otherwise, we send flow to nbd. + lifo.push_back((self.graph[id][nbd].dest, new_flow)); + } + } + Ok(()) + } + + //This function takes a flow, and a cost function on the edges, and tries to find an + // equivalent flow with a better cost, by finding improving overflow cycles. It uses + // as subroutine the Bellman Ford algorithm run up to path_length. + // We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only + // one needs to be present in the cost function. 
+ pub fn optimize_flow_with_cost( + &mut self, + cost: &CostFunction, + path_length: usize, + ) -> Result<(), String> { + //We build the weighted graph g where we will look for negative cycle + let mut gf = self.build_cost_graph(cost)?; + let mut cycles = gf.list_negative_cycles(path_length); + while !cycles.is_empty() { + //we enumerate negative cycles + for c in cycles.iter() { + for i in 0..c.len() { + //We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertextoid[&c[i]]; + let idv = self.vertextoid[&c[(i + 1) % c.len()]]; + for j in 0..self.graph[idu].len() { + //since idu appears at most once in the cycles, we enumerate every + //edge at most once. + let edge = self.graph[idu][j]; + if edge.dest == idv { + self.graph[idu][j].flow += 1; + self.graph[idv][edge.rev].flow -= 1; + break; + } + } + } + } + + gf = self.build_cost_graph(cost)?; + cycles = gf.list_negative_cycles(path_length); + } + Ok(()) + } + + //Construct the weighted graph G_f from the flow and the cost function + fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { + let mut g = Graph::::new(&self.idtovertex); + let nb_vertices = self.idtovertex.len(); + for i in 0..nb_vertices { + for edge in self.graph[i].iter() { + if edge.cap as i32 - edge.flow > 0 { + //It is possible to send overflow through this edge + let u = self.idtovertex[i]; + let v = self.idtovertex[edge.dest]; + if cost.contains_key(&(u, v)) { + g.add_edge(u, v, cost[&(u, v)])?; + } else if cost.contains_key(&(v, u)) { + g.add_edge(u, v, -cost[&(v, u)])?; + } else { + g.add_edge(u, v, 0)?; + } + } + } + } + Ok(g) + } } +impl Graph { + //This function adds a single directed weighted edge to the graph. 
+ pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { + if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { + return Err("The graph does not contain the provided vertex.".to_string()); + } + let idu = self.vertextoid[&u]; + let idv = self.vertextoid[&v]; + self.graph[idu].push(WeightedEdge { w, dest: idv }); + Ok(()) + } + + //This function lists the negative cycles it manages to find after path_length + //iterations of the main loop of the Bellman-Ford algorithm. For the classical + //algorithm, path_length needs to be equal to the number of vertices. However, + //for particular graph structures like our case, the algorithm is still correct + //when path_length is the length of the longest possible simple path. + //See the formal description of the algorithm for more details. + fn list_negative_cycles(&self, path_length: usize) -> Vec> { + let nb_vertices = self.graph.len(); + + //We start with every vertex at distance 0 of some imaginary extra -1 vertex. + let mut distance = vec![0; nb_vertices]; + //The prev vector collects for every vertex from where does the shortest path come + let mut prev = vec![None; nb_vertices]; + + for _ in 0..path_length + 1 { + for id in 0..nb_vertices { + for e in self.graph[id].iter() { + if distance[id] + e.w < distance[e.dest] { + distance[e.dest] = distance[id] + e.w; + prev[e.dest] = Some(id); + } + } + } + } + + //If self.graph contains a negative cycle, then at this point the graph described + //by prev (which is a directed 1-forest/functional graph) + //must contain a cycle. We list the cycles of prev. + let cycles_prev = cycles_of_1_forest(&prev); + + //Remark that the cycle in prev is in the reverse order compared to the cycle + //in the graph. Thus the .rev(). + return cycles_prev + .iter() + .map(|cycle| cycle.iter().rev().map(|id| self.idtovertex[*id]).collect()) + .collect(); + } +} //This function returns the list of cycles of a directed 1 forest. 
It does not //check for the consistency of the input. -fn cycles_of_1_forest(forest: &[Option]) -> Vec> { - let mut cycles = Vec::>::new(); - let mut time_of_discovery = vec![None; forest.len()]; - - for t in 0..forest.len(){ - let mut id = t; - //while we are on a valid undiscovered node - while time_of_discovery[id] == None { - time_of_discovery[id] = Some(t); - if let Some(i) = forest[id] { - id = i; - } - else{ - break; - } - } - if forest[id] != None && time_of_discovery[id] == Some(t) { - //We discovered an id that we explored at this iteration t. - //It means we are on a cycle - let mut cy = vec![id; 1]; - let mut id2 = id; - while let Some(id_next) = forest[id2] { - id2 = id_next; - if id2 != id { - cy.push(id2); - } - else { - break; - } - } - cycles.push(cy); - } - } - cycles +fn cycles_of_1_forest(forest: &[Option]) -> Vec> { + let mut cycles = Vec::>::new(); + let mut time_of_discovery = vec![None; forest.len()]; + + for t in 0..forest.len() { + let mut id = t; + //while we are on a valid undiscovered node + while time_of_discovery[id] == None { + time_of_discovery[id] = Some(t); + if let Some(i) = forest[id] { + id = i; + } else { + break; + } + } + if forest[id] != None && time_of_discovery[id] == Some(t) { + //We discovered an id that we explored at this iteration t. 
+ //It means we are on a cycle + let mut cy = vec![id; 1]; + let mut id2 = id; + while let Some(id_next) = forest[id2] { + id2 = id_next; + if id2 != id { + cy.push(id2); + } else { + break; + } + } + cycles.push(cy); + } + } + cycles } - - diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 976f94af..3a6f42ee 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; -use garage_util::crdt::{AutoCrdt, Crdt, LwwMap, Lww}; +use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::error::*; @@ -27,11 +27,11 @@ pub struct ClusterLayout { pub version: u64, pub replication_factor: usize, - - //This attribute is only used to retain the previously computed partition size, - //to know to what extent does it change with the layout update. - pub partition_size: u32, - pub parameters: LayoutParameters, + + //This attribute is only used to retain the previously computed partition size, + //to know to what extent does it change with the layout update. + pub partition_size: u32, + pub parameters: LayoutParameters, pub roles: LwwMap, @@ -39,7 +39,7 @@ pub struct ClusterLayout { /// in the system (this includes gateway nodes). /// The order here is different than the vec stored by `roles`, because: /// 1. non-gateway nodes are first so that they have lower numbers holding - /// in u8 (the number of non-gateway nodes is at most 256). + /// in u8 (the number of non-gateway nodes is at most 256). /// 2. nodes that don't have a role are excluded (but they need to /// stay in the CRDT as tombstones) pub node_id_vec: Vec, @@ -49,7 +49,7 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Role changes which are staged for the next version of the layout - pub staged_parameters: Lww, + pub staged_parameters: Lww, pub staging: LwwMap, pub staging_hash: Hash, } @@ -58,14 +58,14 @@ pub struct ClusterLayout { ///algorithm. It is stored as a Crdt. 
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct LayoutParameters { - pub zone_redundancy:usize, + pub zone_redundancy: usize, } impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -const NB_PARTITIONS : usize = 1usize << PARTITION_BITS; +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -96,27 +96,28 @@ impl NodeRole { } } - pub fn tags_string(&self) -> String { - let mut tags = String::new(); - if self.tags.is_empty() { - return tags - } - tags.push_str(&self.tags[0].clone()); - for t in 1..self.tags.len(){ - tags.push(','); - tags.push_str(&self.tags[t].clone()); - } - tags - } + pub fn tags_string(&self) -> String { + let mut tags = String::new(); + if self.tags.is_empty() { + return tags; + } + tags.push_str(&self.tags[0].clone()); + for t in 1..self.tags.len() { + tags.push(','); + tags.push_str(&self.tags[t].clone()); + } + tags + } } impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - - //We set the default zone redundancy to be equal to the replication factor, - //i.e. as strict as possible. - let parameters = LayoutParameters{ zone_redundancy: replication_factor}; - let staged_parameters = Lww::::new(parameters.clone()); + //We set the default zone redundancy to be equal to the replication factor, + //i.e. as strict as possible. 
+ let parameters = LayoutParameters { + zone_redundancy: replication_factor, + }; + let staged_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); @@ -124,12 +125,12 @@ impl ClusterLayout { ClusterLayout { version: 0, replication_factor, - partition_size: 0, + partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), - parameters, - staged_parameters, + parameters, + staged_parameters, staging: empty_lwwmap, staging_hash: empty_lwwmap_hash, } @@ -142,11 +143,10 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); - self.staged_parameters.merge(&other.staged_parameters); + let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); + self.staged_parameters.merge(&other.staged_parameters); self.staging.merge(&other.staging); - let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); let stage_changed = new_staging_hash != self.staging_hash; @@ -158,7 +158,7 @@ impl ClusterLayout { } } - pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self,Message), Error> { + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { match version { None => { let error = r#" @@ -177,14 +177,14 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging); self.roles.retain(|(_, _, v)| v.0.is_some()); - let msg = self.calculate_partition_assignation()?; + let msg = self.calculate_partition_assignation()?; self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); self.version += 1; - Ok((self,msg)) + Ok((self, msg)) } pub fn revert_staged_changes(mut self, version: Option) -> Result { @@ -229,44 +229,52 @@ To know the correct value of the new layout 
version, invoke `garage layout show` } } - ///Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub fn useful_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity != None => result.push(*uuid), - _ => () - } - } - result - } - - ///Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid : &Uuid) -> Result { - match self.node_role(uuid) { - Some(role) => Ok(role.zone.clone()), - _ => Err(Error::Message("The Uuid does not correspond to a node present in the cluster.".into())) - } - } - - ///Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid : &Uuid) -> Result { - match self.node_role(uuid) { - Some(NodeRole{capacity : Some(cap), zone: _, tags: _}) => Ok(*cap), - _ => Err(Error::Message("The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity.".into())) - } - } - - ///Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { - let mut total_capacity = 0; - for uuid in self.useful_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; - } - Ok(total_capacity) - } + ///Returns the uuids of the non_gateway nodes in self.node_id_vec. 
+ pub fn useful_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity != None => result.push(*uuid), + _ => (), + } + } + result + } + + ///Given a node uuids, this function returns the label of its zone + pub fn get_node_zone(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(role) => Ok(role.zone.clone()), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + + ///Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole { + capacity: Some(cap), + zone: _, + tags: _, + }) => Ok(*cap), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )), + } + } + ///Returns the sum of capacities of non gateway nodes in the cluster + pub fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.useful_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + Ok(total_capacity) + } /// Check a cluster layout for internal consistency /// returns true if consistent, false if error @@ -311,580 +319,689 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that every partition is associated to distinct nodes - let rf = self.replication_factor; - for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignation_data[rf*p..rf*(p+1)].to_vec(); - if nodes_of_p.iter().unique().count() != rf { - return false; - } - //Check that every partition is spread over at least zone_redundancy zones. 
- let zones_of_p = nodes_of_p.iter() - .map(|n| self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.")); - let redundancy = self.parameters.zone_redundancy; - if zones_of_p.unique().count() < redundancy { - return false; - } - } - - //Check that the nodes capacities is consistent with the stored partitions - let mut node_usage = vec![0; MAX_NODE_NUMBER]; - for n in self.ring_assignation_data.iter() { - node_usage[*n as usize] += 1; - } - for (n, usage) in node_usage.iter().enumerate(){ - if *usage > 0 { - let uuid = self.node_id_vec[n]; - if usage*self.partition_size > self.get_node_capacity(&uuid) - .expect("Critical Error"){ - return false; - } - } - } - - //Check that the partition size stored is the one computed by the asignation - //algorithm. - let cl2 = self.clone(); - let (_ , zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); - let partition_size = cl2.compute_optimal_partition_size(&zone_to_id).expect("Critical Error"); - if partition_size != self.partition_size { - return false; - } + //Check that every partition is associated to distinct nodes + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return false; + } + //Check that every partition is spread over at least zone_redundancy zones. 
+ let zones_of_p = nodes_of_p.iter().map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }); + let redundancy = self.parameters.zone_redundancy; + if zones_of_p.unique().count() < redundancy { + return false; + } + } + + //Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignation_data.iter() { + node_usage[*n as usize] += 1; + } + for (n, usage) in node_usage.iter().enumerate() { + if *usage > 0 { + let uuid = self.node_id_vec[n]; + if usage * self.partition_size + > self.get_node_capacity(&uuid).expect("Critical Error") + { + return false; + } + } + } + //Check that the partition size stored is the one computed by the asignation + //algorithm. + let cl2 = self.clone(); + let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); + let partition_size = cl2 + .compute_optimal_partition_size(&zone_to_id) + .expect("Critical Error"); + if partition_size != self.partition_size { + return false; + } true } - } impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor - /// and the zone redundancy parameter It maximizes the capacity of a + /// and the zone redundancy parameter It maximizes the capacity of a /// partition (assuming all partitions have the same size). /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - /// Staged changes must be merged with nodes roles before calling this function. - pub fn calculate_partition_assignation(&mut self) -> Result { + /// Staged changes must be merged with nodes roles before calling this function. + pub fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. 
//So we need to first update the list of nodes and retrieve the //assignation. - - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with the new ids - let old_assignation_opt = self.update_node_id_vec()?; - - let redundancy = self.staged_parameters.get().zone_redundancy; - - - let mut msg = Message::new(); - msg.push(format!("Computation of a new cluster layout where partitions are \ - replicated {} times on at least {} distinct zones.", self.replication_factor, redundancy)); - - //We generate for once numerical ids for the zones of non gateway nodes, - //to use them as indices in the flow graphs. - let (id_to_zone , zone_to_id) = self.generate_useful_zone_ids()?; - - let nb_useful_nodes = self.useful_nodes().len(); - msg.push(format!("The cluster contains {} nodes spread over {} zones.", - nb_useful_nodes, id_to_zone.len())); - if nb_useful_nodes < self.replication_factor{ - return Err(Error::Message(format!("The number of nodes with positive \ + + //We update the node ids, since the node role list might have changed with the + //changes in the layout. We retrieve the old_assignation reframed with the new ids + let old_assignation_opt = self.update_node_id_vec()?; + + let redundancy = self.staged_parameters.get().zone_redundancy; + + let mut msg = Message::new(); + msg.push(format!( + "Computation of a new cluster layout where partitions are \ + replicated {} times on at least {} distinct zones.", + self.replication_factor, redundancy + )); + + //We generate for once numerical ids for the zones of non gateway nodes, + //to use them as indices in the flow graphs. 
+ let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; + + let nb_useful_nodes = self.useful_nodes().len(); + msg.push(format!( + "The cluster contains {} nodes spread over {} zones.", + nb_useful_nodes, + id_to_zone.len() + )); + if nb_useful_nodes < self.replication_factor { + return Err(Error::Message(format!( + "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_useful_nodes, self.replication_factor))); - } - if id_to_zone.len() < redundancy { - return Err(Error::Message(format!("The number of zones with non-gateway \ + nb_useful_nodes, self.replication_factor + ))); + } + if id_to_zone.len() < redundancy { + return Err(Error::Message(format!( + "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", - id_to_zone.len() , redundancy))); - } - - //We compute the optimal partition size - //Capacities should be given in a unit so that partition size is at least 100. - //In this case, integer rounding plays a marginal role in the percentages of - //optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; - - if old_assignation_opt != None { - msg.push(format!("Given the replication and redundancy constraint, the \ + id_to_zone.len(), + redundancy + ))); + } + + //We compute the optimal partition size + //Capacities should be given in a unit so that partition size is at least 100. + //In this case, integer rounding plays a marginal role in the percentages of + //optimality. + let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + + if old_assignation_opt != None { + msg.push(format!( + "Given the replication and redundancy constraint, the \ optimal size of a partition is {}. 
In the previous layout, it used to \ - be {} (the zone redundancy was {}).", partition_size, self.partition_size, - self.parameters.zone_redundancy)); - } - else { - msg.push(format!("Given the replication and redundancy constraints, the \ - optimal size of a partition is {}.", partition_size)); - } - self.partition_size = partition_size; - self.parameters = self.staged_parameters.get().clone(); - - if partition_size < 100 { - msg.push("WARNING: The partition size is low (< 100), you might consider to \ - provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb).".into()); - } - - //We compute a first flow/assignment that is heuristically close to the previous - //assignment - let mut gflow = self.compute_candidate_assignment( &zone_to_id, &old_assignation_opt)?; - if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignment. - self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; - } - - msg.append(&mut self.output_stat(&gflow, &old_assignation_opt, &zone_to_id,&id_to_zone)?); - msg.push("".to_string()); - - //We update the layout structure - self.update_ring_from_flow(id_to_zone.len() , &gflow)?; - Ok(msg) - } + be {} (the zone redundancy was {}).", + partition_size, self.partition_size, self.parameters.zone_redundancy + )); + } else { + msg.push(format!( + "Given the replication and redundancy constraints, the \ + optimal size of a partition is {}.", + partition_size + )); + } + self.partition_size = partition_size; + self.parameters = self.staged_parameters.get().clone(); + + if partition_size < 100 { + msg.push( + "WARNING: The partition size is low (< 100), you might consider to \ + provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb)." 
+ .into(), + ); + } + + //We compute a first flow/assignment that is heuristically close to the previous + //assignment + let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignation_opt)?; + if let Some(assoc) = &old_assignation_opt { + //We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; + } + + msg.append(&mut self.output_stat( + &gflow, + &old_assignation_opt, + &zone_to_id, + &id_to_zone, + )?); + msg.push("".to_string()); + + //We update the layout structure + self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + Ok(msg) + } /// The LwwMap of node roles might have changed. This function updates the node_id_vec /// and returns the assignation given by ring, with the new indices of the nodes, and /// None if the node is not present anymore. /// We work with the assumption that only this function and calculate_new_assignation /// do modify assignation_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result< Option< Vec > > ,Error> { - // (1) We compute the new node list - //Non gateway nodes should be coded on 8bits, hence they must be first in the list - //We build the new node ids - let mut new_non_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) - .map(|(k, _, _)| *k).collect(); - - if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(Error::Message(format!("There are more than {} non-gateway nodes in the new \ - layout. 
This is not allowed.", MAX_NODE_NUMBER) )); - } - - let mut new_gateway_nodes: Vec = self.roles.items().iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) - .map(|(k, _, _)| *k).collect(); - - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.append(&mut new_non_gateway_nodes); - new_node_id_vec.append(&mut new_gateway_nodes); - - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); - - // (2) We retrieve the old association - //We rewrite the old association with the new indices. We only consider partition - //to node assignations where the node is still in use. - let nb_partitions = 1usize << PARTITION_BITS; - let mut old_assignation = vec![ Vec::::new() ; nb_partitions]; - - if self.ring_assignation_data.is_empty() { - //This is a new association - return Ok(None); - } - if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { - return Err(Error::Message("The old assignation does not have a size corresponding to \ - the old replication factor or the number of partitions.".into())); - } - - //We build a translation table between the uuid and new ids - let mut uuid_to_new_id = HashMap::::new(); - - //We add the indices of only the new non-gateway nodes that can be used in the - //association ring - for (i, uuid) in new_node_id_vec.iter().enumerate() { - uuid_to_new_id.insert(*uuid, i ); - } - - let rf= self.replication_factor; - for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { - for old_id in &self.ring_assignation_data[p*rf..(p+1)*rf] { - let uuid = old_node_id_vec[*old_id as usize]; - if uuid_to_new_id.contains_key(&uuid) { - old_assign_p.push(uuid_to_new_id[&uuid]); - } - } - } - - //We write the ring - self.ring_assignation_data = Vec::::new(); - - if !self.check() { - return Err(Error::Message("Critical error: The computed layout happens to be incorrect".into())); - } - - Ok(Some(old_assignation)) + fn update_node_id_vec(&mut self) -> 
Result>>, Error> { + // (1) We compute the new node list + //Non gateway nodes should be coded on 8bits, hence they must be first in the list + //We build the new node ids + let mut new_non_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) + .map(|(k, _, _)| *k) + .collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(Error::Message(format!( + "There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", + MAX_NODE_NUMBER + ))); + } + + let mut new_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) + .map(|(k, _, _)| *k) + .collect(); + + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.append(&mut new_non_gateway_nodes); + new_node_id_vec.append(&mut new_gateway_nodes); + + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); + + // (2) We retrieve the old association + //We rewrite the old association with the new indices. We only consider partition + //to node assignations where the node is still in use. + let nb_partitions = 1usize << PARTITION_BITS; + let mut old_assignation = vec![Vec::::new(); nb_partitions]; + + if self.ring_assignation_data.is_empty() { + //This is a new association + return Ok(None); + } + if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + return Err(Error::Message( + "The old assignation does not have a size corresponding to \ + the old replication factor or the number of partitions." 
+ .into(), + )); + } + + //We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + //We add the indices of only the new non-gateway nodes that can be used in the + //association ring + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i); + } + + let rf = self.replication_factor; + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { + for old_id in &self.ring_assignation_data[p * rf..(p + 1) * rf] { + let uuid = old_node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assign_p.push(uuid_to_new_id[&uuid]); + } + } + } + + //We write the ring + self.ring_assignation_data = Vec::::new(); + + if !self.check() { + return Err(Error::Message( + "Critical error: The computed layout happens to be incorrect".into(), + )); + } + + Ok(Some(old_assignation)) } + ///This function generates ids for the zone of the nodes appearing in + ///self.node_id_vec. + fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.useful_nodes().iter() { + if self.roles.get(uuid) == None { + return Err(Error::Message( + "The uuid was not found in the node roles (this should \ + not happen, it might be a critical error)." + .into(), + )); + } + if let Some(r) = self.node_role(uuid) { + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + } + } + Ok((id_to_zone, zone_to_id)) + } - ///This function generates ids for the zone of the nodes appearing in - ///self.node_id_vec. 
- fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap),Error>{ - let mut id_to_zone = Vec::::new(); - let mut zone_to_id = HashMap::::new(); - - for uuid in self.useful_nodes().iter() { - if self.roles.get(uuid) == None { - return Err(Error::Message("The uuid was not found in the node roles (this should \ - not happen, it might be a critical error).".into())); - } - if let Some(r) = self.node_role(uuid) { - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone() , id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } - } - } - Ok((id_to_zone, zone_to_id)) - } - - ///This function computes by dichotomy the largest realizable partition size, given - ///the layout. - fn compute_optimal_partition_size(&self, zone_to_id: &HashMap) -> Result{ - let nb_partitions = 1usize << PARTITION_BITS; - let empty_set = HashSet::<(usize,usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (nb_partitions*self.replication_factor).try_into().unwrap() { - return Err(Error::Message("The storage capacity of he cluster is to small. It is \ - impossible to store partitions of size 1.".into())); - } - - let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; - while s_down +1 < s_up { - g = self.generate_flow_graph((s_down+s_up)/2, zone_to_id, &empty_set)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? 
< (nb_partitions*self.replication_factor).try_into().unwrap() { - s_up = (s_down+s_up)/2; - } - else { - s_down = (s_down+s_up)/2; - } - } - - Ok(s_down) - } - - fn generate_graph_vertices(nb_zones : usize, nb_nodes : usize) -> Vec { - let mut vertices = vec![Vertex::Source, Vertex::Sink]; - for p in 0..NB_PARTITIONS { - vertices.push(Vertex::Pup(p)); - vertices.push(Vertex::Pdown(p)); - for z in 0..nb_zones { - vertices.push(Vertex::PZ(p, z)); - } - } - for n in 0..nb_nodes { - vertices.push(Vertex::N(n)); - } - vertices - } - - fn generate_flow_graph(&self, size: u32, zone_to_id: &HashMap, exclude_assoc : &HashSet<(usize,usize)>) -> Result, Error> { - let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), - self.useful_nodes().len()); - let mut g= Graph::::new(&vertices); - let nb_zones = zone_to_id.len(); - let redundancy = self.staged_parameters.get().zone_redundancy; - for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; - g.add_edge(Vertex::Source, Vertex::Pdown(p), (self.replication_factor - redundancy) as u32)?; - for z in 0..nb_zones { - g.add_edge(Vertex::Pup(p) , Vertex::PZ(p,z) , 1)?; - g.add_edge(Vertex::Pdown(p) , Vertex::PZ(p,z) , - self.replication_factor as u32)?; - } - } - for n in 0..self.useful_nodes().len() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity/size)?; - for p in 0..NB_PARTITIONS { - if !exclude_assoc.contains(&(p,n)) { - g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; - } - } - } - Ok(g) - } - - - fn compute_candidate_assignment(&self, zone_to_id: &HashMap, - old_assoc_opt : &Option >>) -> Result, Error > { - - //We list the edges that are not used in the old association - let mut exclude_edge = HashSet::<(usize,usize)>::new(); - if let Some(old_assoc) = old_assoc_opt { - let nb_nodes = self.useful_nodes().len(); - for 
(p, old_assoc_p) in old_assoc.iter().enumerate() { - for n in 0..nb_nodes { - exclude_edge.insert((p,n)); - } - for n in old_assoc_p.iter() { - exclude_edge.remove(&(p,*n)); - } - } - } - - //We compute the best flow using only the edges used in the old assoc - let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge )?; - g.compute_maximal_flow()?; - for (p,n) in exclude_edge.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; - g.add_edge(Vertex::PZ(*p,node_zone), Vertex::N(*n), 1)?; - } - g.compute_maximal_flow()?; - Ok(g) - } - - fn minimize_rebalance_load(&self, gflow: &mut Graph, zone_to_id: &HashMap, old_assoc : &[Vec ]) -> Result<(), Error > { - let mut cost = CostFunction::new(); - for (p, assoc_p) in old_assoc.iter().enumerate(){ - for n in assoc_p.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; - cost.insert((Vertex::PZ(p,node_zone), Vertex::N(*n)), -1); - } - } - let nb_nodes = self.useful_nodes().len(); - let path_length = 4*nb_nodes; - gflow.optimize_flow_with_cost(&cost, path_length)?; - - Ok(()) - } - - fn update_ring_from_flow(&mut self, nb_zones : usize, gflow: &Graph ) -> Result<(), Error>{ - self.ring_assignation_data = Vec::::new(); - for p in 0..NB_PARTITIONS { - for z in 0..nb_zones { - let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - for vertex in assoc_vertex.iter() { - if let Vertex::N(n) = vertex { - self.ring_assignation_data.push((*n).try_into().unwrap()); - } - } - } - } - - if self.ring_assignation_data.len() != NB_PARTITIONS*self.replication_factor { - return Err(Error::Message("Critical Error : the association ring we produced does not \ - have the right size.".into())); - } - Ok(()) - } - - - //This function returns a message summing up the partition repartition of the new - //layout. 
- fn output_stat(&self , gflow : &Graph, - old_assoc_opt : &Option< Vec> >, - zone_to_id: &HashMap, - id_to_zone : &[String]) -> Result{ - let mut msg = Message::new(); - + ///This function computes by dichotomy the largest realizable partition size, given + ///the layout. + fn compute_optimal_partition_size( + &self, + zone_to_id: &HashMap, + ) -> Result { let nb_partitions = 1usize << PARTITION_BITS; - let used_cap = self.partition_size * nb_partitions as u32 * - self.replication_factor as u32; - let total_cap = self.get_total_capacity()?; - let percent_cap = 100.0*(used_cap as f32)/(total_cap as f32); - msg.push(format!("Available capacity / Total cluster capacity: {} / {} ({:.1} %)", - used_cap , total_cap , percent_cap )); - msg.push("".into()); - msg.push("If the percentage is to low, it might be that the \ + let empty_set = HashSet::<(usize, usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? + < (nb_partitions * self.replication_factor) + .try_into() + .unwrap() + { + return Err(Error::Message( + "The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1." + .into(), + )); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down + 1 < s_up { + g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? 
+ < (nb_partitions * self.replication_factor) + .try_into() + .unwrap() + { + s_up = (s_down + s_up) / 2; + } else { + s_down = (s_down + s_up) / 2; + } + } + + Ok(s_down) + } + + fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + vertices + } + + fn generate_flow_graph( + &self, + size: u32, + zone_to_id: &HashMap, + exclude_assoc: &HashSet<(usize, usize)>, + ) -> Result, Error> { + let vertices = + ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); + let mut g = Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + let redundancy = self.staged_parameters.get().zone_redundancy; + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge( + Vertex::Source, + Vertex::Pdown(p), + (self.replication_factor - redundancy) as u32, + )?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; + g.add_edge( + Vertex::Pdown(p), + Vertex::PZ(p, z), + self.replication_factor as u32, + )?; + } + } + for n in 0..self.useful_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p, n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + Ok(g) + } + + fn compute_candidate_assignment( + &self, + zone_to_id: &HashMap, + old_assoc_opt: &Option>>, + ) -> Result, Error> { + //We list the edges that are not used in the old association + let mut exclude_edge = HashSet::<(usize, usize)>::new(); + if let Some(old_assoc) = 
old_assoc_opt { + let nb_nodes = self.useful_nodes().len(); + for (p, old_assoc_p) in old_assoc.iter().enumerate() { + for n in 0..nb_nodes { + exclude_edge.insert((p, n)); + } + for n in old_assoc_p.iter() { + exclude_edge.remove(&(p, *n)); + } + } + } + + //We compute the best flow using only the edges used in the old assoc + let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + g.compute_maximal_flow()?; + for (p, n) in exclude_edge.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + Ok(g) + } + + fn minimize_rebalance_load( + &self, + gflow: &mut Graph, + zone_to_id: &HashMap, + old_assoc: &[Vec], + ) -> Result<(), Error> { + let mut cost = CostFunction::new(); + for (p, assoc_p) in old_assoc.iter().enumerate() { + for n in assoc_p.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); + } + } + let nb_nodes = self.useful_nodes().len(); + let path_length = 4 * nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + Ok(()) + } + + fn update_ring_from_flow( + &mut self, + nb_zones: usize, + gflow: &Graph, + ) -> Result<(), Error> { + self.ring_assignation_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + for vertex in assoc_vertex.iter() { + if let Vertex::N(n) = vertex { + self.ring_assignation_data.push((*n).try_into().unwrap()); + } + } + } + } + + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "Critical Error : the association ring we produced does not \ + have the right size." + .into(), + )); + } + Ok(()) + } + + //This function returns a message summing up the partition repartition of the new + //layout. 
+ fn output_stat( + &self, + gflow: &Graph, + old_assoc_opt: &Option>>, + zone_to_id: &HashMap, + id_to_zone: &[String], + ) -> Result { + let mut msg = Message::new(); + + let nb_partitions = 1usize << PARTITION_BITS; + let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push(format!( + "Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + used_cap, total_cap, percent_cap + )); + msg.push("".into()); + msg.push( + "If the percentage is to low, it might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ - See the detailed statistics below and look for saturated nodes/zones.".into()); - msg.push(format!("Recall that because of the replication factor, the actual available \ - storage capacity is {} / {} = {}.", - used_cap , self.replication_factor , - used_cap/self.replication_factor as u32)); - - //We define and fill in the following tables - let storing_nodes = self.useful_nodes(); - let mut new_partitions = vec![0; storing_nodes.len()]; - let mut stored_partitions = vec![0; storing_nodes.len()]; - - let mut new_partitions_zone = vec![0; id_to_zone.len()]; - let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - - for p in 0..nb_partitions { - for z in 0..id_to_zone.len() { - let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p,z))?; - if !pz_nodes.is_empty() { - stored_partitions_zone[z] += 1; - if let Some(old_assoc) = old_assoc_opt { - let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { - old_zones_of_p.push( - zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } - } - for vert in pz_nodes.iter() { - if let Vertex::N(n) = *vert { - 
stored_partitions[n] += 1; - if let Some(old_assoc) = old_assoc_opt { - if !old_assoc[p].contains(&n) { - new_partitions[n] += 1; - } - } - } - } - } - } - - if *old_assoc_opt == None { - new_partitions = stored_partitions.clone(); - new_partitions_zone = stored_partitions_zone.clone(); - } - - //We display the statistics - - msg.push("".into()); - if *old_assoc_opt != None { - let total_new_partitions : usize = new_partitions.iter().sum(); - msg.push(format!("A total of {} new copies of partitions need to be \ - transferred.", total_new_partitions)); - } - msg.push("".into()); - msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); - - for z in 0..id_to_zone.len(){ - let mut nodes_of_z = Vec::::new(); - for n in 0..storing_nodes.len(){ - if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { - nodes_of_z.push(n); - } - } - let replicated_partitions : usize = nodes_of_z.iter() - .map(|n| stored_partitions[*n]).sum(); - msg.push("".into()); - - msg.push(format!("Zone {}: {} distinct partitions stored ({} new, \ - {} partition copies) ", id_to_zone[z], stored_partitions_zone[z], - new_partitions_zone[z], replicated_partitions)); - - let available_cap_z : u32 = self.partition_size*replicated_partitions as u32; - let mut total_cap_z = 0; - for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; - } - let percent_cap_z = 100.0*(available_cap_z as f32)/(total_cap_z as f32); - msg.push(format!(" Available capacity / Total capacity: {}/{} ({:.1}%).", - available_cap_z, total_cap_z, percent_cap_z)); - - for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u32 *self.partition_size; - let total_cap_n =self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self.node_role(&self.node_id_vec[*n]) - .ok_or("Node not found."))?.tags_string(); - msg.push(format!(" Node {}: {} partitions ({} new) ; \ - available/total capacity: {} / {} ({:.1}%) ; tags:{}", - 
&self.node_id_vec[*n].to_vec()[0..2].to_vec().encode_hex::(), - stored_partitions[*n], - new_partitions[*n], available_cap_n, total_cap_n, - (available_cap_n as f32)/(total_cap_n as f32)*100.0 , - tags_n)); - } - } - - Ok(msg) - } - + See the detailed statistics below and look for saturated nodes/zones." + .into(), + ); + msg.push(format!( + "Recall that because of the replication factor, the actual available \ + storage capacity is {} / {} = {}.", + used_cap, + self.replication_factor, + used_cap / self.replication_factor as u32 + )); + + //We define and fill in the following tables + let storing_nodes = self.useful_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..nb_partitions { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + if !pz_nodes.is_empty() { + stored_partitions_zone[z] += 1; + if let Some(old_assoc) = old_assoc_opt { + let mut old_zones_of_p = Vec::::new(); + for n in old_assoc[p].iter() { + old_zones_of_p + .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(old_assoc) = old_assoc_opt { + if !old_assoc[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + } + } + + if *old_assoc_opt == None { + new_partitions = stored_partitions.clone(); + new_partitions_zone = stored_partitions_zone.clone(); + } + + //We display the statistics + + msg.push("".into()); + if *old_assoc_opt != None { + let total_new_partitions: usize = new_partitions.iter().sum(); + msg.push(format!( + "A total of {} new copies of partitions need to be \ + transferred.", + total_new_partitions + )); + } + 
msg.push("".into()); + msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); + + for z in 0..id_to_zone.len() { + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len() { + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions: usize = + nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); + msg.push("".into()); + + msg.push(format!( + "Zone {}: {} distinct partitions stored ({} new, \ + {} partition copies) ", + id_to_zone[z], + stored_partitions_zone[z], + new_partitions_zone[z], + replicated_partitions + )); + + let available_cap_z: u32 = self.partition_size * replicated_partitions as u32; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); + msg.push(format!( + " Available capacity / Total capacity: {}/{} ({:.1}%).", + available_cap_z, total_cap_z, percent_cap_z + )); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u32 * self.partition_size; + let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self + .node_role(&self.node_id_vec[*n]) + .ok_or("Node not found."))? + .tags_string(); + msg.push(format!( + " Node {}: {} partitions ({} new) ; \ + available/total capacity: {} / {} ({:.1}%) ; tags:{}", + &self.node_id_vec[*n].to_vec()[0..2] + .to_vec() + .encode_hex::(), + stored_partitions[*n], + new_partitions[*n], + available_cap_n, + total_cap_n, + (available_cap_n as f32) / (total_cap_n as f32) * 100.0, + tags_n + )); + } + } + + Ok(msg) + } } //==================================================================================== #[cfg(test)] mod tests { - use super::{*,Error}; - use std::cmp::min; - - - //This function checks that the partition size S computed is at least better than the - //one given by a very naive algorithm. 
To do so, we try to run the naive algorithm - //assuming a partion size of S+1. If we succed, it means that the optimal assignation - //was not optimal. The naive algorithm is the following : - //- we compute the max number of partitions associated to every node, capped at the - //partition number. It gives the number of tokens of every node. - //- every zone has a number of tokens equal to the sum of the tokens of its nodes. - //- we cycle over the partitions and associate zone tokens while respecting the - //zone redundancy constraint. - //NOTE: the naive algorithm is not optimal. Counter example: - //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - //With these parameters, the naive algo fails, whereas there is a solution: - //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &ClusterLayout) -> Result { - let over_size = cl.partition_size +1; - let mut zone_token = HashMap::::new(); - let nb_partitions = 1usize << PARTITION_BITS; - - let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.useful_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; - zone_token.insert(z.clone(), zone_token[&z] + min(nb_partitions , (c/over_size) as usize)); - } - - //For every partition, we count the number of zone already associated and - //the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z,t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; nb_partitions]; - let mut last_zone = vec![zones.len(); nb_partitions]; - - let mut curr_zone = 0; - - let redundancy = cl.parameters.zone_redundancy; - - for replic in 0..cl.replication_factor { - for p in 0..nb_partitions { - while id_zone_token[curr_zone] == 0 || - (last_zone[p] 
== curr_zone - && redundancy - nb_token[p] <= cl.replication_factor - replic) { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg : &Message) { - for s in msg.iter(){ - println!("{}",s); - } - } + use super::{Error, *}; + use std::cmp::min; + + //This function checks that the partition size S computed is at least better than the + //one given by a very naive algorithm. To do so, we try to run the naive algorithm + //assuming a partion size of S+1. If we succed, it means that the optimal assignation + //was not optimal. The naive algorithm is the following : + //- we compute the max number of partitions associated to every node, capped at the + //partition number. It gives the number of tokens of every node. + //- every zone has a number of tokens equal to the sum of the tokens of its nodes. + //- we cycle over the partitions and associate zone tokens while respecting the + //zone redundancy constraint. + //NOTE: the naive algorithm is not optimal. 
Counter example: + //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + //With these parameters, the naive algo fails, whereas there is a solution: + //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &ClusterLayout) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + let nb_partitions = 1usize << PARTITION_BITS; + + let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.useful_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert( + z.clone(), + zone_token[&z] + min(nb_partitions, (c / over_size) as usize), + ); + } + + //For every partition, we count the number of zone already associated and + //the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; nb_partitions]; + let mut last_zone = vec![zones.len(); nb_partitions]; + + let mut curr_zone = 0; + + let redundancy = cl.parameters.zone_redundancy; + + for replic in 0..cl.replication_factor { + for p in 0..nb_partitions { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } + + fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } + } fn update_layout( cl: &mut ClusterLayout, node_id_vec: &Vec, node_capacity_vec: &Vec, node_zone_vec: &Vec, - zone_redundancy: usize + zone_redundancy: usize, ) { for i in 
0..node_id_vec.len() { if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { @@ -901,12 +1018,12 @@ mod tests { ); cl.roles.merge(&update); } - cl.staged_parameters = Lww::::new(LayoutParameters{zone_redundancy}); + cl.staged_parameters = Lww::::new(LayoutParameters { zone_redundancy }); } #[test] fn test_assignation() { - let mut node_id_vec = vec![1, 2, 3]; + let mut node_id_vec = vec![1, 2, 3]; let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"] .into_iter() @@ -936,11 +1053,12 @@ mod tests { assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); - node_capacity_vec = vec![4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000]; + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); show_msg(&cl.calculate_partition_assignation().unwrap()); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); - } } diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 1036a8e1..17e92dd7 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -7,12 +7,11 @@ mod consul; #[cfg(feature = "kubernetes-discovery")] mod kubernetes; -pub mod layout; pub mod graph_algo; +pub mod layout; pub mod ring; pub mod system; - mod metrics; pub mod rpc_helper; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 655d21de..9e0bfa11 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,7 +565,6 @@ impl System { return Err(Error::Message(msg)); } - let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3 From e5664c9822c6ed1ecb30cac41b6a4125da3f88e7 Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 11 Oct 2022 17:17:13 +0200 Subject: Improved the statistics displayed in layout show corrected a few bugs --- src/garage/cli/layout.rs | 69 ++++++++++++++++++++----------- 
src/rpc/layout.rs | 105 +++++++++++++++++++++++++++++------------------ 2 files changed, 111 insertions(+), 63 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index f747fbe4..5056e57d 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -169,7 +169,7 @@ pub async fn cmd_show_layout( rpc_cli: &Endpoint, rpc_host: NodeID, ) -> Result<(), Error> { - let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER LAYOUT ===="); if !print_cluster_layout(&layout) { @@ -179,41 +179,40 @@ pub async fn cmd_show_layout( println!(); println!("Current cluster layout version: {}", layout.version); - if print_staging_role_changes(&layout) { - layout.roles.merge(&layout.staging); - - println!(); - println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - if !print_cluster_layout(&layout) { - println!("No nodes have a role in the new layout."); - } - println!(); - - println!("==== PARAMETERS OF THE LAYOUT COMPUTATION ===="); - println!( - "Zone redundancy: {}", - layout.staged_parameters.get().zone_redundancy - ); - println!(); + let has_role_changes = print_staging_role_changes(&layout); + let has_param_changes = print_staging_parameters_changes(&layout); + if has_role_changes || has_param_changes { + let v = layout.version; + let res_apply = layout.apply_staged_changes(Some(v + 1)); // this will print the stats of what partitions // will move around when we apply - match layout.calculate_partition_assignation() { - Ok(msg) => { + match res_apply { + Ok((layout, msg)) => { + println!(); + println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); + if !print_cluster_layout(&layout) { + println!("No nodes have a role in the new layout."); + } + println!(); + for line in msg.iter() { println!("{}", line); } println!("To enact the staged role changes, type:"); println!(); - println!(" garage layout apply --version {}", layout.version + 1); + 
println!(" garage layout apply --version {}", v + 1); println!(); println!( "You can also revert all proposed changes with: garage layout revert --version {}", - layout.version + 1) + v + 1) } Err(Error::Message(s)) => { println!("Error while trying to compute the assignation: {}", s); println!("This new layout cannot yet be applied."); + println!( + "You can also revert all proposed changes with: garage layout revert --version {}", + v + 1) } _ => { println!("Unknown Error"); @@ -321,21 +320,29 @@ pub async fn send_layout( } pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { - let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; + let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { Some(r) => r, _ => continue, }; let tags = role.tags.join(","); + let usage = layout.get_node_usage(id).unwrap_or(0); + let capacity = layout.get_node_capacity(id).unwrap_or(1); table.push(format!( - "{:?}\t{}\t{}\t{}", + "{:?}\t{}\t{}\t{}\t{} ({:.1}%)", id, tags, role.zone, - role.capacity_string() + role.capacity_string(), + usage as u32 * layout.partition_size, + (100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32) )); } + println!(); + println!("Parameters of the layout computation:"); + println!("Zone redundancy: {}", layout.parameters.zone_redundancy); + println!(); if table.len() == 1 { false } else { @@ -344,6 +351,20 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { } } +pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { + let has_changes = layout.staged_parameters.get().clone() != layout.parameters; + if has_changes { + println!(); + println!("==== NEW LAYOUT PARAMETERS ===="); + println!( + "Zone redundancy: {}", + layout.staged_parameters.get().zone_redundancy + ); + println!(); + } + has_changes +} + pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { let has_changes = layout .staging 
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 3a6f42ee..d2ed8af8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -205,6 +205,7 @@ To know the correct value of the new layout version, invoke `garage layout show` self.staging.clear(); self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + self.staged_parameters.update(self.parameters.clone()); self.version += 1; @@ -267,6 +268,26 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + ///Returns the number of partitions associated to this node in the ring + pub fn get_node_usage(&self, uuid: &Uuid) -> Result { + for (i, id) in self.node_id_vec.iter().enumerate() { + if id == uuid { + let mut count = 0; + for nod in self.ring_assignation_data.iter() { + if i as u8 == *nod { + count += 1 + } + } + return Ok(count); + } + } + Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )) + } + ///Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; @@ -357,11 +378,10 @@ To know the correct value of the new layout version, invoke `garage layout show` //algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); - let partition_size = cl2 - .compute_optimal_partition_size(&zone_to_id) - .expect("Critical Error"); - if partition_size != self.partition_size { - return false; + match cl2.compute_optimal_partition_size(&zone_to_id) { + Ok(s) if s != self.partition_size => return false, + Err(_) => return false, + _ => (), } true @@ -376,8 +396,9 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - /// Staged changes must be merged with nodes roles before calling this function. 
- pub fn calculate_partition_assignation(&mut self) -> Result { + // Staged role changes must be merged with nodes roles before calling this function, + // hence it must only be called from apply_staged_changes() and it is not public. + fn calculate_partition_assignation(&mut self) -> Result { //The nodes might have been updated, some might have been deleted. //So we need to first update the list of nodes and retrieve the //assignation. @@ -386,13 +407,15 @@ impl ClusterLayout { //changes in the layout. We retrieve the old_assignation reframed with the new ids let old_assignation_opt = self.update_node_id_vec()?; - let redundancy = self.staged_parameters.get().zone_redundancy; + self.parameters = self.staged_parameters.get().clone(); let mut msg = Message::new(); + msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); + msg.push("".into()); msg.push(format!( - "Computation of a new cluster layout where partitions are \ + "Partitions are \ replicated {} times on at least {} distinct zones.", - self.replication_factor, redundancy + self.replication_factor, self.parameters.zone_redundancy )); //We generate for once numerical ids for the zones of non gateway nodes, @@ -400,11 +423,6 @@ impl ClusterLayout { let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; let nb_useful_nodes = self.useful_nodes().len(); - msg.push(format!( - "The cluster contains {} nodes spread over {} zones.", - nb_useful_nodes, - id_to_zone.len() - )); if nb_useful_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ @@ -412,12 +430,12 @@ impl ClusterLayout { nb_useful_nodes, self.replication_factor ))); } - if id_to_zone.len() < redundancy { + if id_to_zone.len() < self.parameters.zone_redundancy { return Err(Error::Message(format!( "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", id_to_zone.len(), - redundancy + self.parameters.zone_redundancy ))); } @@ -429,10 
+447,8 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!( - "Given the replication and redundancy constraint, the \ - optimal size of a partition is {}. In the previous layout, it used to \ - be {} (the zone redundancy was {}).", - partition_size, self.partition_size, self.parameters.zone_redundancy + "Optimal size of a partition: {} (was {} in the previous layout).", + partition_size, self.partition_size )); } else { msg.push(format!( @@ -442,7 +458,6 @@ impl ClusterLayout { )); } self.partition_size = partition_size; - self.parameters = self.staged_parameters.get().clone(); if partition_size < 100 { msg.push( @@ -470,6 +485,13 @@ impl ClusterLayout { //We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + + if !self.check() { + return Err(Error::Message( + "Critical error: The computed layout happens to be incorrect".into(), + )); + } + Ok(msg) } @@ -553,12 +575,6 @@ impl ClusterLayout { //We write the ring self.ring_assignation_data = Vec::::new(); - if !self.check() { - return Err(Error::Message( - "Critical error: The computed layout happens to be incorrect".into(), - )); - } - Ok(Some(old_assignation)) } @@ -652,7 +668,7 @@ impl ClusterLayout { ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.staged_parameters.get().zone_redundancy; + let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; g.add_edge( @@ -774,8 +790,9 @@ impl ClusterLayout { let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push("".into()); msg.push(format!( - "Available capacity / Total cluster capacity: {} / {} ({:.1} %)", + "Usable capacity / Total cluster capacity: 
{} / {} ({:.1} %)", used_cap, total_cap, percent_cap )); msg.push("".into()); @@ -878,7 +895,7 @@ impl ClusterLayout { } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); msg.push(format!( - " Available capacity / Total capacity: {}/{} ({:.1}%).", + " Usable capacity / Total capacity: {}/{} ({:.1}%).", available_cap_z, total_cap_z, percent_cap_z )); @@ -891,7 +908,7 @@ impl ClusterLayout { .tags_string(); msg.push(format!( " Node {}: {} partitions ({} new) ; \ - available/total capacity: {} / {} ({:.1}%) ; tags:{}", + usable/total capacity: {} / {} ({:.1}%) ; tags:{}", &self.node_id_vec[*n].to_vec()[0..2] .to_vec() .encode_hex::(), @@ -1008,7 +1025,7 @@ mod tests { cl.node_id_vec.push(x); } - let update = cl.roles.update_mutator( + let update = cl.staging.update_mutator( cl.node_id_vec[i], NodeRoleV(Some(NodeRole { zone: (node_zone_vec[i].to_string()), @@ -1016,9 +1033,11 @@ mod tests { tags: (vec![]), })), ); - cl.roles.merge(&update); + cl.staging.merge(&update); } - cl.staged_parameters = Lww::::new(LayoutParameters { zone_redundancy }); + cl.staging_hash = blake2sum(&rmp_to_vec_all_named(&cl.staging).unwrap()[..]); + cl.staged_parameters + .update(LayoutParameters { zone_redundancy }); } #[test] @@ -1032,7 +1051,9 @@ mod tests { let mut cl = ClusterLayout::new(3); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); @@ -1043,13 +1064,17 @@ mod tests { .map(|x| x.to_string()) .collect(); update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); 
assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); @@ -1057,7 +1082,9 @@ mod tests { 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, ]; update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - show_msg(&cl.calculate_partition_assignation().unwrap()); + let v = cl.version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); assert!(cl.check()); assert!(matches!(check_against_naive(&cl), Ok(true))); } -- cgit v1.2.3 From bcdd1e0c3335500a6d0337ce6ee050fb59fc665a Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 11 Oct 2022 18:29:21 +0200 Subject: Added some comment --- src/rpc/graph_algo.rs | 79 ++++++++++++------------ src/rpc/layout.rs | 162 +++++++++++++++++++++++++++++--------------------- 2 files changed, 132 insertions(+), 109 deletions(-) diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 13c60692..5bd6cc51 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -6,10 +6,10 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::collections::VecDeque; -//Vertex data structures used in all the graphs used in layout.rs. -//usize parameters correspond to node/zone/partitions ids. -//To understand the vertex roles below, please refer to the formal description -//of the layout computation algorithm. +///Vertex data structures used in all the graphs used in layout.rs. +///usize parameters correspond to node/zone/partitions ids. +///To understand the vertex roles below, please refer to the formal description +///of the layout computation algorithm. 
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum Vertex { Source, @@ -20,8 +20,7 @@ pub enum Vertex { Sink, } -//Edge data structure for the flow algorithm. -//The graph is stored as an adjacency list +///Edge data structure for the flow algorithm. #[derive(Clone, Copy, Debug)] pub struct FlowEdge { cap: u32, //flow maximal capacity of the edge @@ -30,8 +29,7 @@ pub struct FlowEdge { rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v } -//Edge data structure for the detection of negative cycles. -//The graph is stored as a list of edges (u,v). +///Edge data structure for the detection of negative cycles. #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { w: i32, //weight of the edge @@ -42,13 +40,14 @@ pub trait Edge: Clone + Copy {} impl Edge for FlowEdge {} impl Edge for WeightedEdge {} -//Struct for the graph structure. We do encapsulation here to be able to both -//provide user friendly Vertex enum to address vertices, and to use usize indices -//and Vec instead of HashMap in the graph algorithm to optimize execution speed. +///Struct for the graph structure. We do encapsulation here to be able to both +///provide user friendly Vertex enum to address vertices, and to use internally usize +///indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. pub struct Graph { vertextoid: HashMap, idtovertex: Vec, + //The graph is stored as an adjacency list graph: Vec>, } @@ -69,8 +68,8 @@ impl Graph { } impl Graph { - //This function adds a directed edge to the graph with capacity c, and the - //corresponding reversed edge with capacity 0. + ///This function adds a directed edge to the graph with capacity c, and the + ///corresponding reversed edge with capacity 0. 
pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -94,8 +93,8 @@ impl Graph { Ok(()) } - //This function returns the list of vertices that receive a positive flow from - //vertex v. + ///This function returns the list of vertices that receive a positive flow from + ///vertex v. pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { if !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -110,7 +109,7 @@ impl Graph { Ok(result) } - //This function returns the value of the flow incoming to v. + ///This function returns the value of the flow incoming to v. pub fn get_inflow(&self, v: Vertex) -> Result { if !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -123,7 +122,7 @@ impl Graph { Ok(result) } - //This function returns the value of the flow outgoing from v. + ///This function returns the value of the flow outgoing from v. pub fn get_outflow(&self, v: Vertex) -> Result { if !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -136,14 +135,14 @@ impl Graph { Ok(result) } - //This function computes the flow total value by computing the outgoing flow - //from the source. + ///This function computes the flow total value by computing the outgoing flow + ///from the source. pub fn get_flow_value(&mut self) -> Result { self.get_outflow(Vertex::Source) } - //This function shuffles the order of the edge lists. It keeps the ids of the - //reversed edges consistent. + ///This function shuffles the order of the edge lists. It keeps the ids of the + ///reversed edges consistent. 
fn shuffle_edges(&mut self) { let mut rng = rand::thread_rng(); for i in 0..self.graph.len() { @@ -157,7 +156,7 @@ impl Graph { } } - //Computes an upper bound of the flow n the graph + ///Computes an upper bound of the flow on the graph pub fn flow_upper_bound(&self) -> u32 { let idsource = self.vertextoid[&Vertex::Source]; let mut flow_upper_bound = 0; @@ -167,9 +166,9 @@ impl Graph { flow_upper_bound } - //This function computes the maximal flow using Dinic's algorithm. It starts with - //the flow values already present in the graph. So it is possible to add some edge to - //the graph, compute a flow, add other edges, update the flow. + ///This function computes the maximal flow using Dinic's algorithm. It starts with + ///the flow values already present in the graph. So it is possible to add some edge to + ///the graph, compute a flow, add other edges, update the flow. pub fn compute_maximal_flow(&mut self) -> Result<(), String> { if !self.vertextoid.contains_key(&Vertex::Source) { return Err("The graph does not contain a source.".to_string()); @@ -270,11 +269,11 @@ impl Graph { Ok(()) } - //This function takes a flow, and a cost function on the edges, and tries to find an - // equivalent flow with a better cost, by finding improving overflow cycles. It uses - // as subroutine the Bellman Ford algorithm run up to path_length. - // We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and only - // one needs to be present in the cost function. + ///This function takes a flow, and a cost function on the edges, and tries to find an + /// equivalent flow with a better cost, by finding improving overflow cycles. It uses + /// as subroutine the Bellman Ford algorithm run up to path_length. + /// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and + /// only one needs to be present in the cost function. 
pub fn optimize_flow_with_cost( &mut self, cost: &CostFunction, @@ -309,7 +308,7 @@ impl Graph { Ok(()) } - //Construct the weighted graph G_f from the flow and the cost function + ///Construct the weighted graph G_f from the flow and the cost function fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { let mut g = Graph::::new(&self.idtovertex); let nb_vertices = self.idtovertex.len(); @@ -334,7 +333,7 @@ impl Graph { } impl Graph { - //This function adds a single directed weighted edge to the graph. + ///This function adds a single directed weighted edge to the graph. pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { return Err("The graph does not contain the provided vertex.".to_string()); @@ -345,12 +344,12 @@ impl Graph { Ok(()) } - //This function lists the negative cycles it manages to find after path_length - //iterations of the main loop of the Bellman-Ford algorithm. For the classical - //algorithm, path_length needs to be equal to the number of vertices. However, - //for particular graph structures like our case, the algorithm is still correct - //when path_length is the length of the longest possible simple path. - //See the formal description of the algorithm for more details. + ///This function lists the negative cycles it manages to find after path_length + ///iterations of the main loop of the Bellman-Ford algorithm. For the classical + ///algorithm, path_length needs to be equal to the number of vertices. However, + ///for particular graph structures like in our case, the algorithm is still correct + ///when path_length is the length of the longest possible simple path. + ///See the formal description of the algorithm for more details. 
fn list_negative_cycles(&self, path_length: usize) -> Vec> { let nb_vertices = self.graph.len(); @@ -384,8 +383,8 @@ impl Graph { } } -//This function returns the list of cycles of a directed 1 forest. It does not -//check for the consistency of the input. +///This function returns the list of cycles of a directed 1 forest. It does not +///check for the consistency of the input. fn cycles_of_1_forest(forest: &[Option]) -> Vec> { let mut cycles = Vec::>::new(); let mut time_of_discovery = vec![None; forest.len()]; diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index d2ed8af8..38e56b88 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -17,6 +17,8 @@ use crate::ring::*; use std::convert::TryInto; +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; + //The Message type will be used to collect information on the algorithm. type Message = Vec; @@ -28,9 +30,11 @@ pub struct ClusterLayout { pub replication_factor: usize, - //This attribute is only used to retain the previously computed partition size, - //to know to what extent does it change with the layout update. + ///This attribute is only used to retain the previously computed partition size, + ///to know to what extent does it change with the layout update. pub partition_size: u32, + ///Parameters used to compute the assignation currently given by + ///ring_assignation_data pub parameters: LayoutParameters, pub roles: LwwMap, @@ -48,8 +52,9 @@ pub struct ClusterLayout { #[serde(with = "serde_bytes")] pub ring_assignation_data: Vec, - /// Role changes which are staged for the next version of the layout + /// Parameters to be used in the next partition assignation computation. 
pub staged_parameters: Lww, + /// Role changes which are staged for the next version of the layout pub staging: LwwMap, pub staging_hash: Hash, } @@ -65,8 +70,6 @@ impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRoleV(pub Option); @@ -77,12 +80,13 @@ impl AutoCrdt for NodeRoleV { /// The user-assigned roles of cluster nodes #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct NodeRole { - /// Datacenter at which this entry belong. This information might be used to perform a better - /// geodistribution + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution pub zone: String, - /// The (relative) capacity of the node + /// The capacity of the node /// If this is set to None, the node does not participate in storing data for the system /// and is only active as an API gateway to other nodes + // TODO : change the capacity to u64 and use byte unit input/output pub capacity: Option, /// A set of tags to recognize the node pub tags: Vec, @@ -110,6 +114,7 @@ impl NodeRole { } } +//Implementation of the ClusterLayout methods unrelated to the assignation algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { //We set the default zone redundancy to be equal to the replication factor, @@ -231,7 +236,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } ///Returns the uuids of the non_gateway nodes in self.node_id_vec. 
- pub fn useful_nodes(&self) -> Vec { + pub fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { @@ -291,13 +296,14 @@ To know the correct value of the new layout version, invoke `garage layout show` ///Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; - for uuid in self.useful_nodes().iter() { + for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; } Ok(total_capacity) } /// Check a cluster layout for internal consistency + /// (assignation, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> bool { // Check that the hash of the staging data is correct @@ -377,7 +383,7 @@ To know the correct value of the new layout version, invoke `garage layout show` //Check that the partition size stored is the one computed by the asignation //algorithm. let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_useful_zone_ids().expect("Critical Error"); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); match cl2.compute_optimal_partition_size(&zone_to_id) { Ok(s) if s != self.partition_size => return false, Err(_) => return false, @@ -388,6 +394,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } +//Implementation of the ClusterLayout methods related to the assignation algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor @@ -397,16 +404,13 @@ impl ClusterLayout { /// the former assignation (if any) to minimize the amount of /// data to be moved. // Staged role changes must be merged with nodes roles before calling this function, - // hence it must only be called from apply_staged_changes() and it is not public. 
+ // hence it must only be called from apply_staged_changes() and hence is not public. fn calculate_partition_assignation(&mut self) -> Result { - //The nodes might have been updated, some might have been deleted. - //So we need to first update the list of nodes and retrieve the - //assignation. - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with the new ids + //changes in the layout. We retrieve the old_assignation reframed with new ids let old_assignation_opt = self.update_node_id_vec()?; + //We update the parameters self.parameters = self.staged_parameters.get().clone(); let mut msg = Message::new(); @@ -420,14 +424,14 @@ impl ClusterLayout { //We generate for once numerical ids for the zones of non gateway nodes, //to use them as indices in the flow graphs. - let (id_to_zone, zone_to_id) = self.generate_useful_zone_ids()?; + let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_useful_nodes = self.useful_nodes().len(); - if nb_useful_nodes < self.replication_factor { + let nb_nongateway_nodes = self.nongateway_nodes().len(); + if nb_nongateway_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_useful_nodes, self.replication_factor + nb_nongateway_nodes, self.replication_factor ))); } if id_to_zone.len() < self.parameters.zone_redundancy { @@ -457,6 +461,7 @@ impl ClusterLayout { partition_size )); } + //We write the partition size. 
self.partition_size = partition_size; if partition_size < 100 { @@ -467,14 +472,15 @@ impl ClusterLayout { ); } - //We compute a first flow/assignment that is heuristically close to the previous - //assignment - let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignation_opt)?; + //We compute a first flow/assignation that is heuristically close to the previous + //assignation + let mut gflow = self.compute_candidate_assignation(&zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignment. + //We minimize the distance to the previous assignation. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } + //We display statistics of the computation msg.append(&mut self.output_stat( &gflow, &old_assignation_opt, @@ -538,14 +544,13 @@ impl ClusterLayout { // (2) We retrieve the old association //We rewrite the old association with the new indices. We only consider partition //to node assignations where the node is still in use. - let nb_partitions = 1usize << PARTITION_BITS; - let mut old_assignation = vec![Vec::::new(); nb_partitions]; + let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; if self.ring_assignation_data.is_empty() { //This is a new association return Ok(None); } - if self.ring_assignation_data.len() != nb_partitions * self.replication_factor { + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( "The old assignation does not have a size corresponding to \ the old replication factor or the number of partitions." @@ -580,11 +585,11 @@ impl ClusterLayout { ///This function generates ids for the zone of the nodes appearing in ///self.node_id_vec. 
- fn generate_useful_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - for uuid in self.useful_nodes().iter() { + for uuid in self.nongateway_nodes().iter() { if self.roles.get(uuid) == None { return Err(Error::Message( "The uuid was not found in the node roles (this should \ @@ -603,17 +608,16 @@ impl ClusterLayout { } ///This function computes by dichotomy the largest realizable partition size, given - ///the layout. + ///the layout roles and parameters. fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, ) -> Result { - let nb_partitions = 1usize << PARTITION_BITS; let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? - < (nb_partitions * self.replication_factor) + < (NB_PARTITIONS * self.replication_factor) .try_into() .unwrap() { @@ -630,7 +634,7 @@ impl ClusterLayout { g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; if g.get_flow_value()? - < (nb_partitions * self.replication_factor) + < (NB_PARTITIONS * self.replication_factor) .try_into() .unwrap() { @@ -658,14 +662,21 @@ impl ClusterLayout { vertices } + ///Generates the graph to compute the maximal flow corresponding to the optimal + ///partition assignation. + ///exclude_assoc is the set of (partition, node) association that we are forbidden + ///to use (hence we do not add the corresponding edge to the graph). This parameter + ///is used to compute a first flow that uses only edges appearing in the previous + ///assignation. This produces a solution that heuristically should be close to the + ///previous one. 
fn generate_flow_graph( &self, - size: u32, + partition_size: u32, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, ) -> Result, Error> { let vertices = - ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.useful_nodes().len()); + ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); let redundancy = self.parameters.zone_redundancy; @@ -685,10 +696,10 @@ impl ClusterLayout { )?; } } - for n in 0..self.useful_nodes().len() { + for n in 0..self.nongateway_nodes().len() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / size)?; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; @@ -698,28 +709,34 @@ impl ClusterLayout { Ok(g) } - fn compute_candidate_assignment( + ///This function computes a first optimal assignation (in the form of a flow graph). 
+ fn compute_candidate_assignation( &self, zone_to_id: &HashMap, - old_assoc_opt: &Option>>, + prev_assign_opt: &Option>>, ) -> Result, Error> { - //We list the edges that are not used in the old association + //We list the (partition,node) associations that are not used in the + //previous assignation let mut exclude_edge = HashSet::<(usize, usize)>::new(); - if let Some(old_assoc) = old_assoc_opt { - let nb_nodes = self.useful_nodes().len(); - for (p, old_assoc_p) in old_assoc.iter().enumerate() { + if let Some(prev_assign) = prev_assign_opt { + let nb_nodes = self.nongateway_nodes().len(); + for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); } - for n in old_assoc_p.iter() { + for n in prev_assign_p.iter() { exclude_edge.remove(&(p, *n)); } } } - //We compute the best flow using only the edges used in the old assoc + //We compute the best flow using only the edges used in the previous assignation let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; g.compute_maximal_flow()?; + + //We add the excluded edges and compute the maximal flow with the full graph. + //The algorithm is such that it will start with the flow that we just computed + //and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; @@ -728,26 +745,35 @@ impl ClusterLayout { Ok(g) } + ///This function updates the flow graph gflow to minimize the distance between + ///its corresponding assignation and the previous one fn minimize_rebalance_load( &self, gflow: &mut Graph, zone_to_id: &HashMap, - old_assoc: &[Vec], + prev_assign: &[Vec], ) -> Result<(), Error> { + //We define a cost function on the edges (pairs of vertices) corresponding + //to the distance between the two assignations. 
let mut cost = CostFunction::new(); - for (p, assoc_p) in old_assoc.iter().enumerate() { + for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } - let nb_nodes = self.useful_nodes().len(); + + //We compute the maximal length of a simple path in gflow. It is used in the + //Bellman-Ford algorithm in optimize_flow_with_cost to set the number + //of iterations. + let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; Ok(()) } + ///This function updates the assignation ring from the flow graph. fn update_ring_from_flow( &mut self, nb_zones: usize, @@ -775,19 +801,18 @@ impl ClusterLayout { Ok(()) } - //This function returns a message summing up the partition repartition of the new - //layout. + ///This function returns a message summing up the partition repartition of the new + ///layout, and other statistics of the partition assignation computation. 
fn output_stat( &self, gflow: &Graph, - old_assoc_opt: &Option>>, + prev_assign_opt: &Option>>, zone_to_id: &HashMap, id_to_zone: &[String], ) -> Result { let mut msg = Message::new(); - let nb_partitions = 1usize << PARTITION_BITS; - let used_cap = self.partition_size * nb_partitions as u32 * self.replication_factor as u32; + let used_cap = self.partition_size * NB_PARTITIONS as u32 * self.replication_factor as u32; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push("".into()); @@ -813,21 +838,21 @@ impl ClusterLayout { )); //We define and fill in the following tables - let storing_nodes = self.useful_nodes(); + let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; let mut new_partitions_zone = vec![0; id_to_zone.len()]; let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - for p in 0..nb_partitions { + for p in 0..NB_PARTITIONS { for z in 0..id_to_zone.len() { let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; if !pz_nodes.is_empty() { stored_partitions_zone[z] += 1; - if let Some(old_assoc) = old_assoc_opt { + if let Some(prev_assign) = prev_assign_opt { let mut old_zones_of_p = Vec::::new(); - for n in old_assoc[p].iter() { + for n in prev_assign[p].iter() { old_zones_of_p .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); } @@ -839,8 +864,8 @@ impl ClusterLayout { for vert in pz_nodes.iter() { if let Vertex::N(n) = *vert { stored_partitions[n] += 1; - if let Some(old_assoc) = old_assoc_opt { - if !old_assoc[p].contains(&n) { + if let Some(prev_assign) = prev_assign_opt { + if !prev_assign[p].contains(&n) { new_partitions[n] += 1; } } @@ -849,7 +874,7 @@ impl ClusterLayout { } } - if *old_assoc_opt == None { + if *prev_assign_opt == None { new_partitions = stored_partitions.clone(); new_partitions_zone = stored_partitions_zone.clone(); } @@ -857,7 +882,7 @@ 
impl ClusterLayout { //We display the statistics msg.push("".into()); - if *old_assoc_opt != None { + if *prev_assign_opt != None { let total_new_partitions: usize = new_partitions.iter().sum(); msg.push(format!( "A total of {} new copies of partitions need to be \ @@ -950,9 +975,8 @@ mod tests { fn check_against_naive(cl: &ClusterLayout) -> Result { let over_size = cl.partition_size + 1; let mut zone_token = HashMap::::new(); - let nb_partitions = 1usize << PARTITION_BITS; - let (zones, zone_to_id) = cl.generate_useful_zone_ids()?; + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; if zones.is_empty() { return Ok(false); @@ -961,12 +985,12 @@ mod tests { for z in zones.iter() { zone_token.insert(z.clone(), 0); } - for uuid in cl.useful_nodes().iter() { + for uuid in cl.nongateway_nodes().iter() { let z = cl.get_node_zone(uuid)?; let c = cl.get_node_capacity(uuid)?; zone_token.insert( z.clone(), - zone_token[&z] + min(nb_partitions, (c / over_size) as usize), + zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), ); } @@ -978,15 +1002,15 @@ mod tests { id_zone_token[zone_to_id[z]] = *t; } - let mut nb_token = vec![0; nb_partitions]; - let mut last_zone = vec![zones.len(); nb_partitions]; + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; let mut curr_zone = 0; let redundancy = cl.parameters.zone_redundancy; for replic in 0..cl.replication_factor { - for p in 0..nb_partitions { + for p in 0..NB_PARTITIONS { while id_zone_token[curr_zone] == 0 || (last_zone[p] == curr_zone && redundancy - nb_token[p] <= cl.replication_factor - replic) -- cgit v1.2.3 From 3039bb5d431532f0ec907eab5e00f94acc4a3472 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 13 Oct 2022 12:40:42 +0200 Subject: rm .gitattributes --- .gitattributes | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index b634d85f..00000000 --- 
a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.pdf filter=lfs diff=lfs merge=lfs -text -- cgit v1.2.3 From ea5afc251106b3f6e2d07f942ba1f88abeef8765 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 19:34:40 +0100 Subject: Style improvements --- src/api/admin/cluster.rs | 6 +- src/garage/cli/cmd.rs | 2 +- src/garage/cli/layout.rs | 20 ++-- src/rpc/graph_algo.rs | 273 +++++++++++++++++++++++------------------------ src/rpc/layout.rs | 247 +++++++++++++++++++++--------------------- 5 files changed, 271 insertions(+), 277 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 61bfb8c5..040778b1 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -86,7 +86,7 @@ fn get_cluster_layout(garage: &Arc) -> GetClusterLayoutResponse { .map(|(k, _, v)| (hex::encode(k), v.0.clone())) .collect(), staged_role_changes: layout - .staging + .staging_roles .items() .iter() .filter(|(k, _, v)| layout.roles.get(k) != Some(v)) @@ -137,14 +137,14 @@ pub async fn handle_update_cluster_layout( let mut layout = garage.system.get_cluster_layout(); let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); for (node, role) in updates { let node = hex::decode(node).ok_or_bad_request("Invalid node identifier")?; let node = Uuid::try_from(&node).ok_or_bad_request("Invalid node identifier")?; layout - .staging + .staging_roles .merge(&roles.update_mutator(node, NodeRoleV(role))); } diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c8b96489..e352ddf2 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -71,7 +71,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } _ => { - let new_role = match layout.staging.get(&adv.id) { + let new_role = match layout.staging_roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "(pending)", _ => "NO ROLE ASSIGNED", }; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 5056e57d..4b23a096 
100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -63,14 +63,14 @@ pub async fn cmd_assign_role( .collect::, _>>()?; let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); for replaced in args.replace.iter() { let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout - .staging + .staging_roles .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); } _ => { @@ -128,7 +128,7 @@ pub async fn cmd_assign_role( }; layout - .staging + .staging_roles .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); } @@ -148,13 +148,13 @@ pub async fn cmd_remove_role( let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); let deleted_node = find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; layout - .staging + .staging_roles .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); send_layout(rpc_cli, rpc_host, layout).await?; @@ -278,7 +278,7 @@ pub async fn cmd_config_layout( println!("The zone redundancy must be at least 1."); } else { layout - .staged_parameters + .staging_parameters .update(LayoutParameters { zone_redundancy: r }); println!("The new zone redundancy has been saved ({}).", r); } @@ -352,13 +352,13 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { } pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { - let has_changes = layout.staged_parameters.get().clone() != layout.parameters; + let has_changes = layout.staging_parameters.get().clone() != layout.parameters; if has_changes { println!(); println!("==== NEW LAYOUT PARAMETERS ===="); println!( "Zone redundancy: {}", - layout.staged_parameters.get().zone_redundancy + layout.staging_parameters.get().zone_redundancy ); println!(); } @@ -367,7 
+367,7 @@ pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { let has_changes = layout - .staging + .staging_roles .items() .iter() .any(|(k, _, v)| layout.roles.get(k) != Some(v)); @@ -376,7 +376,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { println!(); println!("==== STAGED ROLE CHANGES ===="); let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging.items().iter() { + for (id, _, role) in layout.staging_roles.items().iter() { if layout.roles.get(id) == Some(role) { continue; } diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 5bd6cc51..1e4a819b 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -6,33 +6,33 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::collections::VecDeque; -///Vertex data structures used in all the graphs used in layout.rs. -///usize parameters correspond to node/zone/partitions ids. -///To understand the vertex roles below, please refer to the formal description -///of the layout computation algorithm. +/// Vertex data structures used in all the graphs used in layout.rs. +/// usize parameters correspond to node/zone/partitions ids. +/// To understand the vertex roles below, please refer to the formal description +/// of the layout computation algorithm. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum Vertex { Source, - Pup(usize), //The vertex p+ of partition p - Pdown(usize), //The vertex p- of partition p - PZ(usize, usize), //The vertex corresponding to x_(partition p, zone z) - N(usize), //The vertex corresponding to node n + Pup(usize), // The vertex p+ of partition p + Pdown(usize), // The vertex p- of partition p + PZ(usize, usize), // The vertex corresponding to x_(partition p, zone z) + N(usize), // The vertex corresponding to node n Sink, } -///Edge data structure for the flow algorithm. 
+/// Edge data structure for the flow algorithm. #[derive(Clone, Copy, Debug)] pub struct FlowEdge { - cap: u32, //flow maximal capacity of the edge - flow: i32, //flow value on the edge - dest: usize, //destination vertex id - rev: usize, //index of the reversed edge (v, self) in the edge list of vertex v + cap: u32, // flow maximal capacity of the edge + flow: i32, // flow value on the edge + dest: usize, // destination vertex id + rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v } -///Edge data structure for the detection of negative cycles. +/// Edge data structure for the detection of negative cycles. #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { - w: i32, //weight of the edge + w: i32, // weight of the edge dest: usize, } @@ -40,14 +40,14 @@ pub trait Edge: Clone + Copy {} impl Edge for FlowEdge {} impl Edge for WeightedEdge {} -///Struct for the graph structure. We do encapsulation here to be able to both -///provide user friendly Vertex enum to address vertices, and to use internally usize -///indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. +/// Struct for the graph structure. We do encapsulation here to be able to both +/// provide user friendly Vertex enum to address vertices, and to use internally usize +/// indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. 
pub struct Graph { - vertextoid: HashMap, - idtovertex: Vec, + vertex_to_id: HashMap, + id_to_vertex: Vec, - //The graph is stored as an adjacency list + // The graph is stored as an adjacency list graph: Vec>, } @@ -60,22 +60,30 @@ impl Graph { map.insert(*vert, i); } Graph:: { - vertextoid: map, - idtovertex: vertices.to_vec(), + vertex_to_id: map, + id_to_vertex: vertices.to_vec(), graph: vec![Vec::::new(); vertices.len()], } } + + fn get_vertex_id(&self, v: &Vertex) -> Result { + self.vertex_to_id + .get(v) + .cloned() + .ok_or_else(|| format!("The graph does not contain vertex {:?}", v)) + } } impl Graph { - ///This function adds a directed edge to the graph with capacity c, and the - ///corresponding reversed edge with capacity 0. + /// This function adds a directed edge to the graph with capacity c, and the + /// corresponding reversed edge with capacity 0. pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + if idu == idv { + return Err("Cannot add edge from vertex to itself in flow graph".into()); } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; + let rev_u = self.graph[idu].len(); let rev_v = self.graph[idv].len(); self.graph[idu].push(FlowEdge { @@ -93,28 +101,22 @@ impl Graph { Ok(()) } - ///This function returns the list of vertices that receive a positive flow from - ///vertex v. + /// This function returns the list of vertices that receive a positive flow from + /// vertex v. 
pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; + let idv = self.get_vertex_id(&v)?; let mut result = Vec::::new(); for edge in self.graph[idv].iter() { if edge.flow > 0 { - result.push(self.idtovertex[edge.dest]); + result.push(self.id_to_vertex[edge.dest]); } } Ok(result) } - ///This function returns the value of the flow incoming to v. + /// This function returns the value of the flow incoming to v. pub fn get_inflow(&self, v: Vertex) -> Result { - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; + let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { result += max(0, self.graph[edge.dest][edge.rev].flow); @@ -122,12 +124,9 @@ impl Graph { Ok(result) } - ///This function returns the value of the flow outgoing from v. + /// This function returns the value of the flow outgoing from v. pub fn get_outflow(&self, v: Vertex) -> Result { - if !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idv = self.vertextoid[&v]; + let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { result += max(0, edge.flow); @@ -135,19 +134,19 @@ impl Graph { Ok(result) } - ///This function computes the flow total value by computing the outgoing flow - ///from the source. + /// This function computes the flow total value by computing the outgoing flow + /// from the source. pub fn get_flow_value(&mut self) -> Result { self.get_outflow(Vertex::Source) } - ///This function shuffles the order of the edge lists. It keeps the ids of the - ///reversed edges consistent. + /// This function shuffles the order of the edge lists. 
It keeps the ids of the + /// reversed edges consistent. fn shuffle_edges(&mut self) { let mut rng = rand::thread_rng(); for i in 0..self.graph.len() { self.graph[i].shuffle(&mut rng); - //We need to update the ids of the reverse edges. + // We need to update the ids of the reverse edges. for j in 0..self.graph[i].len() { let target_v = self.graph[i][j].dest; let target_rev = self.graph[i][j].rev; @@ -156,97 +155,86 @@ impl Graph { } } - ///Computes an upper bound of the flow on the graph - pub fn flow_upper_bound(&self) -> u32 { - let idsource = self.vertextoid[&Vertex::Source]; + /// Computes an upper bound of the flow on the graph + pub fn flow_upper_bound(&self) -> Result { + let idsource = self.get_vertex_id(&Vertex::Source)?; let mut flow_upper_bound = 0; for edge in self.graph[idsource].iter() { flow_upper_bound += edge.cap; } - flow_upper_bound + Ok(flow_upper_bound) } - ///This function computes the maximal flow using Dinic's algorithm. It starts with - ///the flow values already present in the graph. So it is possible to add some edge to - ///the graph, compute a flow, add other edges, update the flow. + /// This function computes the maximal flow using Dinic's algorithm. It starts with + /// the flow values already present in the graph. So it is possible to add some edge to + /// the graph, compute a flow, add other edges, update the flow. 
pub fn compute_maximal_flow(&mut self) -> Result<(), String> { - if !self.vertextoid.contains_key(&Vertex::Source) { - return Err("The graph does not contain a source.".to_string()); - } - if !self.vertextoid.contains_key(&Vertex::Sink) { - return Err("The graph does not contain a sink.".to_string()); - } - - let idsource = self.vertextoid[&Vertex::Source]; - let idsink = self.vertextoid[&Vertex::Sink]; + let idsource = self.get_vertex_id(&Vertex::Source)?; + let idsink = self.get_vertex_id(&Vertex::Sink)?; let nb_vertices = self.graph.len(); - let flow_upper_bound = self.flow_upper_bound(); + let flow_upper_bound = self.flow_upper_bound()?; - //To ensure the dispersion of the associations generated by the - //assignation, we shuffle the neighbours of the nodes. Hence, - //the vertices do not consider their neighbours in the same order. + // To ensure the dispersion of the associations generated by the + // assignation, we shuffle the neighbours of the nodes. Hence, + // the vertices do not consider their neighbours in the same order. self.shuffle_edges(); - //We run Dinic's max flow algorithm + // We run Dinic's max flow algorithm loop { - //We build the level array from Dinic's algorithm. + // We build the level array from Dinic's algorithm. 
let mut level = vec![None; nb_vertices]; let mut fifo = VecDeque::new(); fifo.push_back((idsource, 0)); - while !fifo.is_empty() { - if let Some((id, lvl)) = fifo.pop_front() { - if level[id] == None { - //it means id has not yet been reached - level[id] = Some(lvl); - for edge in self.graph[id].iter() { - if edge.cap as i32 - edge.flow > 0 { - fifo.push_back((edge.dest, lvl + 1)); - } + while let Some((id, lvl)) = fifo.pop_front() { + if level[id] == None { + // it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i32 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); } } } } if level[idsink] == None { - //There is no residual flow + // There is no residual flow break; } - //Now we run DFS respecting the level array + // Now we run DFS respecting the level array let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = VecDeque::new(); + let mut lifo = Vec::new(); - lifo.push_back((idsource, flow_upper_bound)); + lifo.push((idsource, flow_upper_bound)); - while let Some((id_tmp, f_tmp)) = lifo.back() { - let id = *id_tmp; - let f = *f_tmp; + while let Some((id, f)) = lifo.last().cloned() { if id == idsink { - //The DFS reached the sink, we can add a - //residual flow. - lifo.pop_back(); - while let Some((id, _)) = lifo.pop_back() { + // The DFS reached the sink, we can add a + // residual flow. 
+ lifo.pop(); + while let Some((id, _)) = lifo.pop() { let nbd = next_nbd[id]; self.graph[id][nbd].flow += f as i32; let id_rev = self.graph[id][nbd].dest; let nbd_rev = self.graph[id][nbd].rev; self.graph[id_rev][nbd_rev].flow -= f as i32; } - lifo.push_back((idsource, flow_upper_bound)); + lifo.push((idsource, flow_upper_bound)); continue; } - //else we did not reach the sink + // else we did not reach the sink let nbd = next_nbd[id]; if nbd >= self.graph[id].len() { - //There is nothing to explore from id anymore - lifo.pop_back(); - if let Some((parent, _)) = lifo.back() { + // There is nothing to explore from id anymore + lifo.pop(); + if let Some((parent, _)) = lifo.last() { next_nbd[*parent] += 1; } continue; } - //else we can try to send flow from id to its nbd + // else we can try to send flow from id to its nbd let new_flow = min( f as i32, self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow, @@ -257,19 +245,19 @@ impl Graph { } if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { if lvldest <= lvlid { - //We cannot send flow to nbd. + // We cannot send flow to nbd. next_nbd[id] += 1; continue; } } - //otherwise, we send flow to nbd. - lifo.push_back((self.graph[id][nbd].dest, new_flow)); + // otherwise, we send flow to nbd. + lifo.push((self.graph[id][nbd].dest, new_flow)); } } Ok(()) } - ///This function takes a flow, and a cost function on the edges, and tries to find an + /// This function takes a flow, and a cost function on the edges, and tries to find an /// equivalent flow with a better cost, by finding improving overflow cycles. It uses /// as subroutine the Bellman Ford algorithm run up to path_length. 
/// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and @@ -279,19 +267,19 @@ impl Graph { cost: &CostFunction, path_length: usize, ) -> Result<(), String> { - //We build the weighted graph g where we will look for negative cycle + // We build the weighted graph g where we will look for negative cycle let mut gf = self.build_cost_graph(cost)?; let mut cycles = gf.list_negative_cycles(path_length); while !cycles.is_empty() { - //we enumerate negative cycles + // we enumerate negative cycles for c in cycles.iter() { for i in 0..c.len() { - //We add one flow unit to the edge (u,v) of cycle c - let idu = self.vertextoid[&c[i]]; - let idv = self.vertextoid[&c[(i + 1) % c.len()]]; + // We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertex_to_id[&c[i]]; + let idv = self.vertex_to_id[&c[(i + 1) % c.len()]]; for j in 0..self.graph[idu].len() { - //since idu appears at most once in the cycles, we enumerate every - //edge at most once. + // since idu appears at most once in the cycles, we enumerate every + // edge at most once. 
let edge = self.graph[idu][j]; if edge.dest == idv { self.graph[idu][j].flow += 1; @@ -308,16 +296,16 @@ impl Graph { Ok(()) } - ///Construct the weighted graph G_f from the flow and the cost function + /// Construct the weighted graph G_f from the flow and the cost function fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { - let mut g = Graph::::new(&self.idtovertex); - let nb_vertices = self.idtovertex.len(); + let mut g = Graph::::new(&self.id_to_vertex); + let nb_vertices = self.id_to_vertex.len(); for i in 0..nb_vertices { for edge in self.graph[i].iter() { if edge.cap as i32 - edge.flow > 0 { - //It is possible to send overflow through this edge - let u = self.idtovertex[i]; - let v = self.idtovertex[edge.dest]; + // It is possible to send overflow through this edge + let u = self.id_to_vertex[i]; + let v = self.id_to_vertex[edge.dest]; if cost.contains_key(&(u, v)) { g.add_edge(u, v, cost[&(u, v)])?; } else if cost.contains_key(&(v, u)) { @@ -333,29 +321,26 @@ impl Graph { } impl Graph { - ///This function adds a single directed weighted edge to the graph. + /// This function adds a single directed weighted edge to the graph. pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { - if !self.vertextoid.contains_key(&u) || !self.vertextoid.contains_key(&v) { - return Err("The graph does not contain the provided vertex.".to_string()); - } - let idu = self.vertextoid[&u]; - let idv = self.vertextoid[&v]; + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; self.graph[idu].push(WeightedEdge { w, dest: idv }); Ok(()) } - ///This function lists the negative cycles it manages to find after path_length - ///iterations of the main loop of the Bellman-Ford algorithm. For the classical - ///algorithm, path_length needs to be equal to the number of vertices. 
However, - ///for particular graph structures like in our case, the algorithm is still correct - ///when path_length is the length of the longest possible simple path. - ///See the formal description of the algorithm for more details. + /// This function lists the negative cycles it manages to find after path_length + /// iterations of the main loop of the Bellman-Ford algorithm. For the classical + /// algorithm, path_length needs to be equal to the number of vertices. However, + /// for particular graph structures like in our case, the algorithm is still correct + /// when path_length is the length of the longest possible simple path. + /// See the formal description of the algorithm for more details. fn list_negative_cycles(&self, path_length: usize) -> Vec> { let nb_vertices = self.graph.len(); - //We start with every vertex at distance 0 of some imaginary extra -1 vertex. + // We start with every vertex at distance 0 of some imaginary extra -1 vertex. let mut distance = vec![0; nb_vertices]; - //The prev vector collects for every vertex from where does the shortest path come + // The prev vector collects for every vertex from where does the shortest path come let mut prev = vec![None; nb_vertices]; for _ in 0..path_length + 1 { @@ -369,29 +354,35 @@ impl Graph { } } - //If self.graph contains a negative cycle, then at this point the graph described - //by prev (which is a directed 1-forest/functional graph) - //must contain a cycle. We list the cycles of prev. + // If self.graph contains a negative cycle, then at this point the graph described + // by prev (which is a directed 1-forest/functional graph) + // must contain a cycle. We list the cycles of prev. let cycles_prev = cycles_of_1_forest(&prev); - //Remark that the cycle in prev is in the reverse order compared to the cycle - //in the graph. Thus the .rev(). + // Remark that the cycle in prev is in the reverse order compared to the cycle + // in the graph. Thus the .rev(). 
return cycles_prev .iter() - .map(|cycle| cycle.iter().rev().map(|id| self.idtovertex[*id]).collect()) + .map(|cycle| { + cycle + .iter() + .rev() + .map(|id| self.id_to_vertex[*id]) + .collect() + }) .collect(); } } -///This function returns the list of cycles of a directed 1 forest. It does not -///check for the consistency of the input. +/// This function returns the list of cycles of a directed 1 forest. It does not +/// check for the consistency of the input. fn cycles_of_1_forest(forest: &[Option]) -> Vec> { let mut cycles = Vec::>::new(); let mut time_of_discovery = vec![None; forest.len()]; for t in 0..forest.len() { let mut id = t; - //while we are on a valid undiscovered node + // while we are on a valid undiscovered node while time_of_discovery[id] == None { time_of_discovery[id] = Some(t); if let Some(i) = forest[id] { @@ -401,8 +392,8 @@ fn cycles_of_1_forest(forest: &[Option]) -> Vec> { } } if forest[id] != None && time_of_discovery[id] == Some(t) { - //We discovered an id that we explored at this iteration t. - //It means we are on a cycle + // We discovered an id that we explored at this iteration t. + // It means we are on a cycle let mut cy = vec![id; 1]; let mut id2 = id; while let Some(id_next) = forest[id2] { diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 38e56b88..95f69dc8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -19,7 +19,7 @@ use std::convert::TryInto; const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; -//The Message type will be used to collect information on the algorithm. +// The Message type will be used to collect information on the algorithm. type Message = Vec; /// The layout of the cluster, i.e. the list of roles @@ -30,11 +30,11 @@ pub struct ClusterLayout { pub replication_factor: usize, - ///This attribute is only used to retain the previously computed partition size, - ///to know to what extent does it change with the layout update. 
+ /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. pub partition_size: u32, - ///Parameters used to compute the assignation currently given by - ///ring_assignation_data + /// Parameters used to compute the assignation currently given by + /// ring_assignation_data pub parameters: LayoutParameters, pub roles: LwwMap, @@ -53,14 +53,14 @@ pub struct ClusterLayout { pub ring_assignation_data: Vec, /// Parameters to be used in the next partition assignation computation. - pub staged_parameters: Lww, + pub staging_parameters: Lww, /// Role changes which are staged for the next version of the layout - pub staging: LwwMap, + pub staging_roles: LwwMap, pub staging_hash: Hash, } -///This struct is used to set the parameters to be used in the assignation computation -///algorithm. It is stored as a Crdt. +/// This struct is used to set the parameters to be used in the assignation computation +/// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub struct LayoutParameters { pub zone_redundancy: usize, @@ -114,20 +114,19 @@ impl NodeRole { } } -//Implementation of the ClusterLayout methods unrelated to the assignation algorithm. +// Implementation of the ClusterLayout methods unrelated to the assignation algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - //We set the default zone redundancy to be equal to the replication factor, - //i.e. as strict as possible. + // We set the default zone redundancy to be equal to the replication factor, + // i.e. as strict as possible. 
let parameters = LayoutParameters { zone_redundancy: replication_factor, }; - let staged_parameters = Lww::::new(parameters.clone()); + let staging_parameters = Lww::::new(parameters.clone()); let empty_lwwmap = LwwMap::new(); - let empty_lwwmap_hash = blake2sum(&rmp_to_vec_all_named(&empty_lwwmap).unwrap()[..]); - ClusterLayout { + let mut ret = ClusterLayout { version: 0, replication_factor, partition_size: 0, @@ -135,10 +134,17 @@ impl ClusterLayout { node_id_vec: Vec::new(), ring_assignation_data: Vec::new(), parameters, - staged_parameters, - staging: empty_lwwmap, - staging_hash: empty_lwwmap_hash, - } + staging_parameters, + staging_roles: empty_lwwmap, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + + fn calculate_staging_hash(&self) -> Hash { + let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&rmp_to_vec_all_named(&hashed_tuple).unwrap()[..]) } pub fn merge(&mut self, other: &ClusterLayout) -> bool { @@ -148,16 +154,15 @@ impl ClusterLayout { true } Ordering::Equal => { - let param_changed = self.staged_parameters.get() != other.staged_parameters.get(); - self.staged_parameters.merge(&other.staged_parameters); - self.staging.merge(&other.staging); + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); - let new_staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - let stage_changed = new_staging_hash != self.staging_hash; + let new_staging_hash = self.calculate_staging_hash(); + let changed = new_staging_hash != self.staging_hash; self.staging_hash = new_staging_hash; - stage_changed || param_changed + changed } Ordering::Less => false, } @@ -179,13 +184,14 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - self.roles.merge(&self.staging); + self.roles.merge(&self.staging_roles); self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = 
self.staging_parameters.get().clone(); let msg = self.calculate_partition_assignation()?; - self.staging.clear(); - self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -208,9 +214,9 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - self.staging.clear(); - self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); - self.staged_parameters.update(self.parameters.clone()); + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); + self.staging_parameters.update(self.parameters.clone()); self.version += 1; @@ -235,7 +241,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the uuids of the non_gateway nodes in self.node_id_vec. + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. pub fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { @@ -247,7 +253,7 @@ To know the correct value of the new layout version, invoke `garage layout show` result } - ///Given a node uuids, this function returns the label of its zone + /// Given a node uuids, this function returns the label of its zone pub fn get_node_zone(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(role) => Ok(role.zone.clone()), @@ -257,7 +263,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Given a node uuids, this function returns its capacity or fails if it does not have any + /// Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole { @@ -273,7 +279,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - ///Returns the number of partitions 
associated to this node in the ring + /// Returns the number of partitions associated to this node in the ring pub fn get_node_usage(&self, uuid: &Uuid) -> Result { for (i, id) in self.node_id_vec.iter().enumerate() { if id == uuid { @@ -293,7 +299,7 @@ To know the correct value of the new layout version, invoke `garage layout show` )) } - ///Returns the sum of capacities of non gateway nodes in the cluster + /// Returns the sum of capacities of non gateway nodes in the cluster pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { @@ -307,7 +313,7 @@ To know the correct value of the new layout version, invoke `garage layout show` /// returns true if consistent, false if error pub fn check(&self) -> bool { // Check that the hash of the staging data is correct - let staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + let staging_hash = self.calculate_staging_hash(); if staging_hash != self.staging_hash { return false; } @@ -346,14 +352,14 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that every partition is associated to distinct nodes + // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); if nodes_of_p.iter().unique().count() != rf { return false; } - //Check that every partition is spread over at least zone_redundancy zones. + // Check that every partition is spread over at least zone_redundancy zones. 
let zones_of_p = nodes_of_p.iter().map(|n| { self.get_node_zone(&self.node_id_vec[*n as usize]) .expect("Zone not found.") @@ -364,7 +370,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that the nodes capacities is consistent with the stored partitions + // Check that the nodes capacities is consistent with the stored partitions let mut node_usage = vec![0; MAX_NODE_NUMBER]; for n in self.ring_assignation_data.iter() { node_usage[*n as usize] += 1; @@ -380,8 +386,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - //Check that the partition size stored is the one computed by the asignation - //algorithm. + // Check that the partition size stored is the one computed by the asignation + // algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); match cl2.compute_optimal_partition_size(&zone_to_id) { @@ -394,7 +400,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } } -//Implementation of the ClusterLayout methods related to the assignation algorithm. +// Implementation of the ClusterLayout methods related to the assignation algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignation. /// The computed assignation respects the node replication factor @@ -403,16 +409,13 @@ impl ClusterLayout { /// Among such optimal assignation, it minimizes the distance to /// the former assignation (if any) to minimize the amount of /// data to be moved. - // Staged role changes must be merged with nodes roles before calling this function, - // hence it must only be called from apply_staged_changes() and hence is not public. + /// Staged role changes must be merged with nodes roles before calling this function, + /// hence it must only be called from apply_staged_changes() and hence is not public. 
fn calculate_partition_assignation(&mut self) -> Result { - //We update the node ids, since the node role list might have changed with the - //changes in the layout. We retrieve the old_assignation reframed with new ids + // We update the node ids, since the node role list might have changed with the + // changes in the layout. We retrieve the old_assignation reframed with new ids let old_assignation_opt = self.update_node_id_vec()?; - //We update the parameters - self.parameters = self.staged_parameters.get().clone(); - let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("".into()); @@ -422,8 +425,8 @@ impl ClusterLayout { self.replication_factor, self.parameters.zone_redundancy )); - //We generate for once numerical ids for the zones of non gateway nodes, - //to use them as indices in the flow graphs. + // We generate for once numerical ids for the zones of non gateway nodes, + // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; let nb_nongateway_nodes = self.nongateway_nodes().len(); @@ -443,10 +446,10 @@ impl ClusterLayout { ))); } - //We compute the optimal partition size - //Capacities should be given in a unit so that partition size is at least 100. - //In this case, integer rounding plays a marginal role in the percentages of - //optimality. + // We compute the optimal partition size + // Capacities should be given in a unit so that partition size is at least 100. + // In this case, integer rounding plays a marginal role in the percentages of + // optimality. let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; if old_assignation_opt != None { @@ -461,7 +464,7 @@ impl ClusterLayout { partition_size )); } - //We write the partition size. + // We write the partition size. 
self.partition_size = partition_size; if partition_size < 100 { @@ -472,15 +475,15 @@ impl ClusterLayout { ); } - //We compute a first flow/assignation that is heuristically close to the previous - //assignation + // We compute a first flow/assignation that is heuristically close to the previous + // assignation let mut gflow = self.compute_candidate_assignation(&zone_to_id, &old_assignation_opt)?; if let Some(assoc) = &old_assignation_opt { - //We minimize the distance to the previous assignation. + // We minimize the distance to the previous assignation. self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; } - //We display statistics of the computation + // We display statistics of the computation msg.append(&mut self.output_stat( &gflow, &old_assignation_opt, @@ -489,7 +492,7 @@ impl ClusterLayout { )?); msg.push("".to_string()); - //We update the layout structure + // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; if !self.check() { @@ -508,8 +511,8 @@ impl ClusterLayout { /// do modify assignation_ring and node_id_vec. fn update_node_id_vec(&mut self) -> Result>>, Error> { // (1) We compute the new node list - //Non gateway nodes should be coded on 8bits, hence they must be first in the list - //We build the new node ids + // Non gateway nodes should be coded on 8bits, hence they must be first in the list + // We build the new node ids let mut new_non_gateway_nodes: Vec = self .roles .items() @@ -542,12 +545,12 @@ impl ClusterLayout { self.node_id_vec = new_node_id_vec.clone(); // (2) We retrieve the old association - //We rewrite the old association with the new indices. We only consider partition - //to node assignations where the node is still in use. + // We rewrite the old association with the new indices. We only consider partition + // to node assignations where the node is still in use. 
let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; if self.ring_assignation_data.is_empty() { - //This is a new association + // This is a new association return Ok(None); } if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { @@ -558,11 +561,11 @@ impl ClusterLayout { )); } - //We build a translation table between the uuid and new ids + // We build a translation table between the uuid and new ids let mut uuid_to_new_id = HashMap::::new(); - //We add the indices of only the new non-gateway nodes that can be used in the - //association ring + // We add the indices of only the new non-gateway nodes that can be used in the + // association ring for (i, uuid) in new_node_id_vec.iter().enumerate() { uuid_to_new_id.insert(*uuid, i); } @@ -577,14 +580,14 @@ impl ClusterLayout { } } - //We write the ring + // We write the ring self.ring_assignation_data = Vec::::new(); Ok(Some(old_assignation)) } - ///This function generates ids for the zone of the nodes appearing in - ///self.node_id_vec. + /// This function generates ids for the zone of the nodes appearing in + /// self.node_id_vec. fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); @@ -607,8 +610,8 @@ impl ClusterLayout { Ok((id_to_zone, zone_to_id)) } - ///This function computes by dichotomy the largest realizable partition size, given - ///the layout roles and parameters. + /// This function computes by dichotomy the largest realizable partition size, given + /// the layout roles and parameters. fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, @@ -662,13 +665,13 @@ impl ClusterLayout { vertices } - ///Generates the graph to compute the maximal flow corresponding to the optimal - ///partition assignation. - ///exclude_assoc is the set of (partition, node) association that we are forbidden - ///to use (hence we do not add the corresponding edge to the graph). 
This parameter - ///is used to compute a first flow that uses only edges appearing in the previous - ///assignation. This produces a solution that heuristically should be close to the - ///previous one. + /// Generates the graph to compute the maximal flow corresponding to the optimal + /// partition assignation. + /// exclude_assoc is the set of (partition, node) association that we are forbidden + /// to use (hence we do not add the corresponding edge to the graph). This parameter + /// is used to compute a first flow that uses only edges appearing in the previous + /// assignation. This produces a solution that heuristically should be close to the + /// previous one. fn generate_flow_graph( &self, partition_size: u32, @@ -709,14 +712,14 @@ impl ClusterLayout { Ok(g) } - ///This function computes a first optimal assignation (in the form of a flow graph). + /// This function computes a first optimal assignation (in the form of a flow graph). fn compute_candidate_assignation( &self, zone_to_id: &HashMap, prev_assign_opt: &Option>>, ) -> Result, Error> { - //We list the (partition,node) associations that are not used in the - //previous assignation + // We list the (partition,node) associations that are not used in the + // previous assignation let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { let nb_nodes = self.nongateway_nodes().len(); @@ -730,13 +733,13 @@ impl ClusterLayout { } } - //We compute the best flow using only the edges used in the previous assignation + // We compute the best flow using only the edges used in the previous assignation let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; g.compute_maximal_flow()?; - //We add the excluded edges and compute the maximal flow with the full graph. - //The algorithm is such that it will start with the flow that we just computed - //and find ameliorating paths from that. 
+ // We add the excluded edges and compute the maximal flow with the full graph. + // The algorithm is such that it will start with the flow that we just computed + // and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; @@ -745,16 +748,16 @@ impl ClusterLayout { Ok(g) } - ///This function updates the flow graph gflow to minimize the distance between - ///its corresponding assignation and the previous one + /// This function updates the flow graph gflow to minimize the distance between + /// its corresponding assignation and the previous one fn minimize_rebalance_load( &self, gflow: &mut Graph, zone_to_id: &HashMap, prev_assign: &[Vec], ) -> Result<(), Error> { - //We define a cost function on the edges (pairs of vertices) corresponding - //to the distance between the two assignations. + // We define a cost function on the edges (pairs of vertices) corresponding + // to the distance between the two assignations. let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { @@ -763,9 +766,9 @@ impl ClusterLayout { } } - //We compute the maximal length of a simple path in gflow. It is used in the - //Bellman-Ford algorithm in optimize_flow_with_cost to set the number - //of iterations. + // We compute the maximal length of a simple path in gflow. It is used in the + // Bellman-Ford algorithm in optimize_flow_with_cost to set the number + // of iterations. let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -773,7 +776,7 @@ impl ClusterLayout { Ok(()) } - ///This function updates the assignation ring from the flow graph. + /// This function updates the assignation ring from the flow graph. 
fn update_ring_from_flow( &mut self, nb_zones: usize, @@ -801,8 +804,8 @@ impl ClusterLayout { Ok(()) } - ///This function returns a message summing up the partition repartition of the new - ///layout, and other statistics of the partition assignation computation. + /// This function returns a message summing up the partition repartition of the new + /// layout, and other statistics of the partition assignation computation. fn output_stat( &self, gflow: &Graph, @@ -837,7 +840,7 @@ impl ClusterLayout { used_cap / self.replication_factor as u32 )); - //We define and fill in the following tables + // We define and fill in the following tables let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -879,7 +882,7 @@ impl ClusterLayout { new_partitions_zone = stored_partitions_zone.clone(); } - //We display the statistics + // We display the statistics msg.push("".into()); if *prev_assign_opt != None { @@ -951,27 +954,27 @@ impl ClusterLayout { } } -//==================================================================================== +// ==================================================================================== #[cfg(test)] mod tests { use super::{Error, *}; use std::cmp::min; - //This function checks that the partition size S computed is at least better than the - //one given by a very naive algorithm. To do so, we try to run the naive algorithm - //assuming a partion size of S+1. If we succed, it means that the optimal assignation - //was not optimal. The naive algorithm is the following : - //- we compute the max number of partitions associated to every node, capped at the - //partition number. It gives the number of tokens of every node. - //- every zone has a number of tokens equal to the sum of the tokens of its nodes. - //- we cycle over the partitions and associate zone tokens while respecting the - //zone redundancy constraint. 
- //NOTE: the naive algorithm is not optimal. Counter example: - //take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - //number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - //With these parameters, the naive algo fails, whereas there is a solution: - //(A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + // This function checks that the partition size S computed is at least better than the + // one given by a very naive algorithm. To do so, we try to run the naive algorithm + // assuming a partion size of S+1. If we succed, it means that the optimal assignation + // was not optimal. The naive algorithm is the following : + // - we compute the max number of partitions associated to every node, capped at the + // partition number. It gives the number of tokens of every node. + // - every zone has a number of tokens equal to the sum of the tokens of its nodes. + // - we cycle over the partitions and associate zone tokens while respecting the + // zone redundancy constraint. + // NOTE: the naive algorithm is not optimal. 
Counter example: + // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + // With these parameters, the naive algo fails, whereas there is a solution: + // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) fn check_against_naive(cl: &ClusterLayout) -> Result { let over_size = cl.partition_size + 1; let mut zone_token = HashMap::::new(); @@ -994,8 +997,8 @@ mod tests { ); } - //For every partition, we count the number of zone already associated and - //the name of the last zone associated + // For every partition, we count the number of zone already associated and + // the name of the last zone associated let mut id_zone_token = vec![0; zones.len()]; for (z, t) in zone_token.iter() { @@ -1049,7 +1052,7 @@ mod tests { cl.node_id_vec.push(x); } - let update = cl.staging.update_mutator( + let update = cl.staging_roles.update_mutator( cl.node_id_vec[i], NodeRoleV(Some(NodeRole { zone: (node_zone_vec[i].to_string()), @@ -1057,10 +1060,10 @@ mod tests { tags: (vec![]), })), ); - cl.staging.merge(&update); + cl.staging_roles.merge(&update); } - cl.staging_hash = blake2sum(&rmp_to_vec_all_named(&cl.staging).unwrap()[..]); - cl.staged_parameters + cl.staging_hash = cl.calculate_staging_hash(); + cl.staging_parameters .update(LayoutParameters { zone_redundancy }); } -- cgit v1.2.3 From fd5bc142b553d716c8265d83cff0bb633aa09e6b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 20:29:25 +0100 Subject: Ensure .sort() is called before counting unique items --- src/rpc/layout.rs | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 95f69dc8..15765662 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -355,17 +355,22 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that every partition is associated to distinct nodes let rf = self.replication_factor; for p in 
0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + let mut nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); + nodes_of_p.sort(); if nodes_of_p.iter().unique().count() != rf { return false; } // Check that every partition is spread over at least zone_redundancy zones. - let zones_of_p = nodes_of_p.iter().map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }); + let mut zones_of_p = nodes_of_p + .iter() + .map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }) + .collect::>(); + zones_of_p.sort(); let redundancy = self.parameters.zone_redundancy; - if zones_of_p.unique().count() < redundancy { + if zones_of_p.iter().unique().count() < redundancy { return false; } } @@ -378,9 +383,7 @@ To know the correct value of the new layout version, invoke `garage layout show` for (n, usage) in node_usage.iter().enumerate() { if *usage > 0 { let uuid = self.node_id_vec[n]; - if usage * self.partition_size - > self.get_node_capacity(&uuid).expect("Critical Error") - { + if usage * self.partition_size > self.get_node_capacity(&uuid).unwrap() { return false; } } @@ -389,7 +392,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that the partition size stored is the one computed by the asignation // algorithm. 
let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().expect("Critical Error"); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); match cl2.compute_optimal_partition_size(&zone_to_id) { Ok(s) if s != self.partition_size => return false, Err(_) => return false, @@ -484,12 +487,7 @@ impl ClusterLayout { } // We display statistics of the computation - msg.append(&mut self.output_stat( - &gflow, - &old_assignation_opt, - &zone_to_id, - &id_to_zone, - )?); + msg.extend(self.output_stat(&gflow, &old_assignation_opt, &zone_to_id, &id_to_zone)?); msg.push("".to_string()); // We update the layout structure -- cgit v1.2.3 From 73a4ca8b1515f95bf7860fc292c12db83d3c6228 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 7 Nov 2022 21:12:11 +0100 Subject: Use bytes as capacity units --- Cargo.lock | 1 + src/garage/cli/layout.rs | 18 +++++++++++--- src/garage/cli/structs.rs | 4 +-- src/rpc/Cargo.toml | 1 + src/rpc/graph_algo.rs | 34 +++++++++++++------------- src/rpc/layout.rs | 62 ++++++++++++++++++++++++----------------------- 6 files changed, 67 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75c25628..c9f63a19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1215,6 +1215,7 @@ dependencies = [ "arc-swap", "async-trait", "bytes", + "bytesize", "err-derive", "futures", "futures-util", diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 4b23a096..85af345a 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,3 +1,5 @@ +use bytesize::ByteSize; + use garage_util::crdt::Crdt; use garage_util::error::*; use garage_util::formater::format_table; @@ -86,7 +88,7 @@ pub async fn cmd_assign_role( return Err(Error::Message( "-c and -g are mutually exclusive, please configure node either with c>0 to act as a storage node or with -g to act as a gateway node".into())); } - if args.capacity == Some(0) { + if args.capacity == Some(ByteSize::b(0)) { return Err(Error::Message("Invalid 
capacity value: 0".into())); } @@ -94,7 +96,7 @@ pub async fn cmd_assign_role( let new_entry = match roles.get(&added_node) { Some(NodeRoleV(Some(old))) => { let capacity = match args.capacity { - Some(c) => Some(c), + Some(c) => Some(c.as_u64()), None if args.gateway => None, None => old.capacity, }; @@ -111,7 +113,7 @@ pub async fn cmd_assign_role( } _ => { let capacity = match args.capacity { - Some(c) => Some(c), + Some(c) => Some(c.as_u64()), None if args.gateway => None, None => return Err(Error::Message( "Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())), @@ -265,6 +267,7 @@ pub async fn cmd_config_layout( ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let mut did_something = false; match config_opt.redundancy { None => (), Some(r) => { @@ -282,9 +285,16 @@ pub async fn cmd_config_layout( .update(LayoutParameters { zone_redundancy: r }); println!("The new zone redundancy has been saved ({}).", r); } + did_something = true; } } + if !did_something { + return Err(Error::Message( + "Please specify an action for `garage layout config` to do".into(), + )); + } + send_layout(rpc_cli, rpc_host, layout).await?; Ok(()) } @@ -335,7 +345,7 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { tags, role.zone, role.capacity_string(), - usage as u32 * layout.partition_size, + ByteSize::b(usage as u64 * layout.partition_size).to_string_as(false), (100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32) )); } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 64798952..49a1f267 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -114,9 +114,9 @@ pub struct AssignRoleOpt { #[structopt(short = "z", long = "zone")] pub(crate) zone: Option, - /// Capacity (in relative terms) + /// Storage capacity, in bytes (supported suffixes: B, KB, MB, GB, TB, PB) #[structopt(short = "c", long = "capacity")] - pub(crate) capacity: Option, 
+ pub(crate) capacity: Option, /// Gateway-only node #[structopt(short = "g", long = "gateway")] diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 5a427131..1b411c6a 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -18,6 +18,7 @@ garage_util = { version = "0.8.0", path = "../util" } arc-swap = "1.0" bytes = "1.0" +bytesize = "1.1" gethostname = "0.2" hex = "0.4" tracing = "0.1.30" diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs index 1e4a819b..f181e2ba 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/graph_algo.rs @@ -23,8 +23,8 @@ pub enum Vertex { /// Edge data structure for the flow algorithm. #[derive(Clone, Copy, Debug)] pub struct FlowEdge { - cap: u32, // flow maximal capacity of the edge - flow: i32, // flow value on the edge + cap: u64, // flow maximal capacity of the edge + flow: i64, // flow value on the edge dest: usize, // destination vertex id rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v } @@ -32,7 +32,7 @@ pub struct FlowEdge { /// Edge data structure for the detection of negative cycles. #[derive(Clone, Copy, Debug)] pub struct WeightedEdge { - w: i32, // weight of the edge + w: i64, // weight of the edge dest: usize, } @@ -51,7 +51,7 @@ pub struct Graph { graph: Vec>, } -pub type CostFunction = HashMap<(Vertex, Vertex), i32>; +pub type CostFunction = HashMap<(Vertex, Vertex), i64>; impl Graph { pub fn new(vertices: &[Vertex]) -> Self { @@ -77,7 +77,7 @@ impl Graph { impl Graph { /// This function adds a directed edge to the graph with capacity c, and the /// corresponding reversed edge with capacity 0. - pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u32) -> Result<(), String> { + pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u64) -> Result<(), String> { let idu = self.get_vertex_id(&u)?; let idv = self.get_vertex_id(&v)?; if idu == idv { @@ -115,7 +115,7 @@ impl Graph { } /// This function returns the value of the flow incoming to v. 
- pub fn get_inflow(&self, v: Vertex) -> Result { + pub fn get_inflow(&self, v: Vertex) -> Result { let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { @@ -125,7 +125,7 @@ impl Graph { } /// This function returns the value of the flow outgoing from v. - pub fn get_outflow(&self, v: Vertex) -> Result { + pub fn get_outflow(&self, v: Vertex) -> Result { let idv = self.get_vertex_id(&v)?; let mut result = 0; for edge in self.graph[idv].iter() { @@ -136,7 +136,7 @@ impl Graph { /// This function computes the flow total value by computing the outgoing flow /// from the source. - pub fn get_flow_value(&mut self) -> Result { + pub fn get_flow_value(&mut self) -> Result { self.get_outflow(Vertex::Source) } @@ -156,7 +156,7 @@ impl Graph { } /// Computes an upper bound of the flow on the graph - pub fn flow_upper_bound(&self) -> Result { + pub fn flow_upper_bound(&self) -> Result { let idsource = self.get_vertex_id(&Vertex::Source)?; let mut flow_upper_bound = 0; for edge in self.graph[idsource].iter() { @@ -193,7 +193,7 @@ impl Graph { // it means id has not yet been reached level[id] = Some(lvl); for edge in self.graph[id].iter() { - if edge.cap as i32 - edge.flow > 0 { + if edge.cap as i64 - edge.flow > 0 { fifo.push_back((edge.dest, lvl + 1)); } } @@ -216,10 +216,10 @@ impl Graph { lifo.pop(); while let Some((id, _)) = lifo.pop() { let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i32; + self.graph[id][nbd].flow += f as i64; let id_rev = self.graph[id][nbd].dest; let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i32; + self.graph[id_rev][nbd_rev].flow -= f as i64; } lifo.push((idsource, flow_upper_bound)); continue; @@ -236,9 +236,9 @@ impl Graph { } // else we can try to send flow from id to its nbd let new_flow = min( - f as i32, - self.graph[id][nbd].cap as i32 - self.graph[id][nbd].flow, - ) as u32; + f as i64, + self.graph[id][nbd].cap as i64 - self.graph[id][nbd].flow, + ) as u64; if 
new_flow == 0 { next_nbd[id] += 1; continue; @@ -302,7 +302,7 @@ impl Graph { let nb_vertices = self.id_to_vertex.len(); for i in 0..nb_vertices { for edge in self.graph[i].iter() { - if edge.cap as i32 - edge.flow > 0 { + if edge.cap as i64 - edge.flow > 0 { // It is possible to send overflow through this edge let u = self.id_to_vertex[i]; let v = self.id_to_vertex[edge.dest]; @@ -322,7 +322,7 @@ impl Graph { impl Graph { /// This function adds a single directed weighted edge to the graph. - pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i32) -> Result<(), String> { + pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i64) -> Result<(), String> { let idu = self.get_vertex_id(&u)?; let idv = self.get_vertex_id(&v)?; self.graph[idu].push(WeightedEdge { w, dest: idv }); diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 15765662..3c80b213 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; -use hex::ToHex; +use bytesize::ByteSize; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -32,7 +32,7 @@ pub struct ClusterLayout { /// This attribute is only used to retain the previously computed partition size, /// to know to what extent does it change with the layout update. 
- pub partition_size: u32, + pub partition_size: u64, /// Parameters used to compute the assignation currently given by /// ring_assignation_data pub parameters: LayoutParameters, @@ -86,8 +86,7 @@ pub struct NodeRole { /// The capacity of the node /// If this is set to None, the node does not participate in storing data for the system /// and is only active as an API gateway to other nodes - // TODO : change the capacity to u64 and use byte unit input/output - pub capacity: Option, + pub capacity: Option, /// A set of tags to recognize the node pub tags: Vec, } @@ -95,7 +94,7 @@ pub struct NodeRole { impl NodeRole { pub fn capacity_string(&self) -> String { match self.capacity { - Some(c) => format!("{}", c), + Some(c) => ByteSize::b(c).to_string_as(false), None => "gateway".to_string(), } } @@ -264,7 +263,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(NodeRole { capacity: Some(cap), @@ -300,7 +299,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + pub fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -458,13 +457,14 @@ impl ClusterLayout { if old_assignation_opt != None { msg.push(format!( "Optimal size of a partition: {} (was {} in the previous layout).", - partition_size, self.partition_size + ByteSize::b(partition_size).to_string_as(false), + ByteSize::b(self.partition_size).to_string_as(false) )); } else { msg.push(format!( "Given the replication and redundancy constraints, the \ optimal size of a partition is 
{}.", - partition_size + ByteSize::b(partition_size).to_string_as(false) )); } // We write the partition size. @@ -613,7 +613,7 @@ impl ClusterLayout { fn compute_optimal_partition_size( &self, zone_to_id: &HashMap, - ) -> Result { + ) -> Result { let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; @@ -672,7 +672,7 @@ impl ClusterLayout { /// previous one. fn generate_flow_graph( &self, - partition_size: u32, + partition_size: u64, zone_to_id: &HashMap, exclude_assoc: &HashSet<(usize, usize)>, ) -> Result, Error> { @@ -682,18 +682,18 @@ impl ClusterLayout { let nb_zones = zone_to_id.len(); let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u32)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; g.add_edge( Vertex::Source, Vertex::Pdown(p), - (self.replication_factor - redundancy) as u32, + (self.replication_factor - redundancy) as u64, )?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; g.add_edge( Vertex::Pdown(p), Vertex::PZ(p, z), - self.replication_factor as u32, + self.replication_factor as u64, )?; } } @@ -813,17 +813,19 @@ impl ClusterLayout { ) -> Result { let mut msg = Message::new(); - let used_cap = self.partition_size * NB_PARTITIONS as u32 * self.replication_factor as u32; + let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push("".into()); msg.push(format!( "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", - used_cap, total_cap, percent_cap + ByteSize::b(used_cap).to_string_as(false), + ByteSize::b(total_cap).to_string_as(false), + percent_cap )); msg.push("".into()); msg.push( - "If the percentage is to low, it might be that the \ + "If the percentage is too low, it 
might be that the \ replication/redundancy constraints force the use of nodes/zones with small \ storage capacities. \ You might want to rebalance the storage capacities or relax the constraints. \ @@ -833,9 +835,9 @@ impl ClusterLayout { msg.push(format!( "Recall that because of the replication factor, the actual available \ storage capacity is {} / {} = {}.", - used_cap, + ByteSize::b(used_cap).to_string_as(false), self.replication_factor, - used_cap / self.replication_factor as u32 + ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) )); // We define and fill in the following tables @@ -914,34 +916,34 @@ impl ClusterLayout { replicated_partitions )); - let available_cap_z: u32 = self.partition_size * replicated_partitions as u32; + let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let mut total_cap_z = 0; for n in nodes_of_z.iter() { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); msg.push(format!( - " Usable capacity / Total capacity: {}/{} ({:.1}%).", - available_cap_z, total_cap_z, percent_cap_z + " Usable capacity / Total capacity: {} / {} ({:.1}%).", + ByteSize::b(available_cap_z).to_string_as(false), + ByteSize::b(total_cap_z).to_string_as(false), + percent_cap_z )); for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u32 * self.partition_size; + let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; let tags_n = (self .node_role(&self.node_id_vec[*n]) .ok_or("Node not found."))? 
.tags_string(); msg.push(format!( - " Node {}: {} partitions ({} new) ; \ + " Node {:?}: {} partitions ({} new) ; \ usable/total capacity: {} / {} ({:.1}%) ; tags:{}", - &self.node_id_vec[*n].to_vec()[0..2] - .to_vec() - .encode_hex::(), + self.node_id_vec[*n], stored_partitions[*n], new_partitions[*n], - available_cap_n, - total_cap_n, + ByteSize::b(available_cap_n).to_string_as(false), + ByteSize::b(total_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, tags_n )); @@ -1041,7 +1043,7 @@ mod tests { fn update_layout( cl: &mut ClusterLayout, node_id_vec: &Vec, - node_capacity_vec: &Vec, + node_capacity_vec: &Vec, node_zone_vec: &Vec, zone_redundancy: usize, ) { -- cgit v1.2.3 From d75b37b018fc0ce8e3832c8531d9556ff7a345c9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 14:23:08 +0100 Subject: Return more info when layout's .check() fails, fix compilation, fix test --- Cargo.nix | 3 ++- src/api/admin/cluster.rs | 7 ++--- src/db/lib.rs | 2 -- src/garage/cli/layout.rs | 32 ++++++++++++++-------- src/garage/main.rs | 3 +++ src/rpc/layout.rs | 70 +++++++++++++++++++++++++++++++----------------- src/rpc/system.rs | 6 ++--- 7 files changed, 77 insertions(+), 46 deletions(-) diff --git a/Cargo.nix b/Cargo.nix index 29483443..ebe7d53f 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -32,7 +32,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "9b1f88c1c5b4639605886c7135957a8fb750d938f789300ba6dae958cae460d9"; + nixifiedLockHash = "a68c589851ec1990d29cdc20e8b922b27c1a6b402b682f7b0d9a9e6258f25828"; workspaceSrc = if args.workspaceSrc == null then ./. 
else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -1736,6 +1736,7 @@ in arc_swap = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".arc-swap."1.5.0" { inherit profileName; }).out; async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.52" { profileName = "__noProfile"; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.2.0" { inherit profileName; }).out; + bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.1.0" { inherit profileName; }).out; ${ if rootFeatures' ? "garage/consul-discovery" || rootFeatures' ? "garage_rpc/consul-discovery" || rootFeatures' ? "garage_rpc/err-derive" then "err_derive" else null } = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.21" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.21" { inherit profileName; }).out; diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 040778b1..7b91f709 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -163,16 +163,13 @@ pub async fn handle_apply_cluster_layout( let layout = garage.system.get_cluster_layout(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; - //TODO : how to display msg ? Should it be in the Body Response ? - for s in msg.iter() { - println!("{}", s); - } garage.system.update_cluster_layout(&layout).await?; Ok(Response::builder() .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) + .header(http::header::CONTENT_TYPE, "text/plain") + .body(Body::from(msg.join("\n")))?) 
} pub async fn handle_revert_cluster_layout( diff --git a/src/db/lib.rs b/src/db/lib.rs index 0a776a91..5304c195 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -2,8 +2,6 @@ #[cfg(feature = "sqlite")] extern crate tracing; -#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] -//compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); #[cfg(feature = "lmdb")] pub mod lmdb_adapter; #[cfg(feature = "sled")] diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 85af345a..53430e6b 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -330,7 +330,7 @@ pub async fn send_layout( } pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { - let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable".to_string()]; + let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { Some(r) => r, @@ -338,16 +338,26 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { }; let tags = role.tags.join(","); let usage = layout.get_node_usage(id).unwrap_or(0); - let capacity = layout.get_node_capacity(id).unwrap_or(1); - table.push(format!( - "{:?}\t{}\t{}\t{}\t{} ({:.1}%)", - id, - tags, - role.zone, - role.capacity_string(), - ByteSize::b(usage as u64 * layout.partition_size).to_string_as(false), - (100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32) - )); + let capacity = layout.get_node_capacity(id).unwrap_or(0); + if capacity > 0 { + table.push(format!( + "{:?}\t{}\t{}\t{}\t{} ({:.1}%)", + id, + tags, + role.zone, + role.capacity_string(), + ByteSize::b(usage as u64 * layout.partition_size).to_string_as(false), + (100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32) + )); + } else { + table.push(format!( + "{:?}\t{}\t{}\t{}", + id, + tags, + role.zone, + role.capacity_string(), + )); + }; } println!(); println!("Parameters of the layout 
computation:"); diff --git a/src/garage/main.rs b/src/garage/main.rs index edda734b..8e64273f 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -17,6 +17,9 @@ compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled #[cfg(all(feature = "bundled-libs", feature = "system-libs"))] compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled"); +#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] +compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); + use std::net::SocketAddr; use std::path::PathBuf; diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 3c80b213..2f4dc129 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -187,11 +187,11 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.retain(|(_, _, v)| v.0.is_some()); self.parameters = self.staging_parameters.get().clone(); - let msg = self.calculate_partition_assignation()?; - self.staging_roles.clear(); self.staging_hash = self.calculate_staging_hash(); + let msg = self.calculate_partition_assignation()?; + self.version += 1; Ok((self, msg)) @@ -214,8 +214,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); self.staging_parameters.update(self.parameters.clone()); + self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -310,11 +310,11 @@ To know the correct value of the new layout version, invoke `garage layout show` /// Check a cluster layout for internal consistency /// (assignation, roles, parameters, partition size) /// returns true if consistent, false if error - pub fn check(&self) -> bool { + pub fn check(&self) -> Result<(), String> { // Check that the hash of the staging data is correct let staging_hash = self.calculate_staging_hash(); if staging_hash != self.staging_hash { - return false; + 
return Err("staging_hash is incorrect".into()); } // Check that node_id_vec contains the correct list of nodes @@ -329,12 +329,17 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut node_id_vec = self.node_id_vec.clone(); node_id_vec.sort(); if expected_nodes != node_id_vec { - return false; + return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } // Check that the assignation data has the correct length - if self.ring_assignation_data.len() != (1 << PARTITION_BITS) * self.replication_factor { - return false; + let expected_assignation_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignation_data.len() != expected_assignation_data_len { + return Err(format!( + "ring_assignation_data has incorrect length {} instead of {}", + self.ring_assignation_data.len(), + expected_assignation_data_len + )); } // Check that the assigned nodes are correct identifiers @@ -342,12 +347,15 @@ To know the correct value of the new layout version, invoke `garage layout show` // and that role is not the role of a gateway nodes for x in self.ring_assignation_data.iter() { if *x as usize >= self.node_id_vec.len() { - return false; + return Err(format!( + "ring_assignation_data contains invalid node id {}", + *x + )); } let node = self.node_id_vec[*x as usize]; match self.roles.get(&node) { Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), - _ => return false, + _ => return Err("ring_assignation_data contains id of a gateway node".into()), } } @@ -357,7 +365,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut nodes_of_p = self.ring_assignation_data[rf * p..rf * (p + 1)].to_vec(); nodes_of_p.sort(); if nodes_of_p.iter().unique().count() != rf { - return false; + return Err(format!("partition does not contain {} unique node ids", rf)); } // Check that every partition is spread over at 
least zone_redundancy zones. let mut zones_of_p = nodes_of_p @@ -370,7 +378,10 @@ To know the correct value of the new layout version, invoke `garage layout show` zones_of_p.sort(); let redundancy = self.parameters.zone_redundancy; if zones_of_p.iter().unique().count() < redundancy { - return false; + return Err(format!( + "nodes of partition are in less than {} distinct zones", + redundancy + )); } } @@ -382,8 +393,14 @@ To know the correct value of the new layout version, invoke `garage layout show` for (n, usage) in node_usage.iter().enumerate() { if *usage > 0 { let uuid = self.node_id_vec[n]; - if usage * self.partition_size > self.get_node_capacity(&uuid).unwrap() { - return false; + let partusage = usage * self.partition_size; + let nodecap = self.get_node_capacity(&uuid).unwrap(); + if partusage > nodecap { + return Err(format!( + "node usage ({}) is bigger than node capacity ({})", + usage * self.partition_size, + nodecap + )); } } } @@ -393,12 +410,17 @@ To know the correct value of the new layout version, invoke `garage layout show` let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); match cl2.compute_optimal_partition_size(&zone_to_id) { - Ok(s) if s != self.partition_size => return false, - Err(_) => return false, + Ok(s) if s != self.partition_size => { + return Err(format!( + "partition_size ({}) is different than optimal value ({})", + self.partition_size, s + )) + } + Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), _ => (), } - true + Ok(()) } } @@ -493,9 +515,9 @@ impl ClusterLayout { // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; - if !self.check() { + if let Err(e) = self.check() { return Err(Error::Message( - "Critical error: The computed layout happens to be incorrect".into(), + format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) )); } @@ -1062,9 +1084,9 @@ mod tests { 
); cl.staging_roles.merge(&update); } - cl.staging_hash = cl.calculate_staging_hash(); cl.staging_parameters .update(LayoutParameters { zone_redundancy }); + cl.staging_hash = cl.calculate_staging_hash(); } #[test] @@ -1081,7 +1103,7 @@ mod tests { let v = cl.version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; @@ -1094,7 +1116,7 @@ mod tests { let v = cl.version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; @@ -1102,7 +1124,7 @@ mod tests { let v = cl.version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); node_capacity_vec = vec![ @@ -1112,7 +1134,7 @@ mod tests { let v = cl.version; let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); show_msg(&msg); - assert!(cl.check()); + assert_eq!(cl.check(), Ok(())); assert!(matches!(check_against_naive(&cl), Ok(true))); } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index d6576f20..224fbabb 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,9 +565,9 @@ impl System { let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); - let prev_layout_check = layout.check(); + let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { - if prev_layout_check && !layout.check() { + if prev_layout_check && !layout.check().is_ok() { error!("New cluster layout is invalid, discarding."); return Err(Error::Message( "New cluster layout is invalid, discarding.".into(), @@ 
-620,7 +620,7 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = !self.ring.borrow().layout.check(); + let not_configured = !self.ring.borrow().layout.check().is_ok(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; let expected_n_nodes = self.ring.borrow().layout.num_nodes(); let bad_peers = self -- cgit v1.2.3 From fc2729cd810b94c47d89e9039ea65726c9a85988 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 15:13:37 +0100 Subject: Fix integration test --- script/dev-cluster.sh | 2 +- script/dev-configure.sh | 3 ++- src/garage/tests/common/garage.rs | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/script/dev-cluster.sh b/script/dev-cluster.sh index c7fbe08d..fa0a950e 100755 --- a/script/dev-cluster.sh +++ b/script/dev-cluster.sh @@ -11,7 +11,7 @@ PATH="${GARAGE_DEBUG}:${GARAGE_RELEASE}:${NIX_RELEASE}:$PATH" FANCYCOLORS=("41m" "42m" "44m" "45m" "100m" "104m") export RUST_BACKTRACE=1 -export RUST_LOG=garage=info,garage_api=debug,netapp=trace +export RUST_LOG=garage=info,garage_api=debug MAIN_LABEL="\e[${FANCYCOLORS[0]}[main]\e[49m" WHICH_GARAGE=$(which garage || exit 1) diff --git a/script/dev-configure.sh b/script/dev-configure.sh index f0a7843d..9c24bf4b 100755 --- a/script/dev-configure.sh +++ b/script/dev-configure.sh @@ -25,7 +25,8 @@ garage -c /tmp/config.1.toml status \ | grep 'NO ROLE' \ | grep -Po '^[0-9a-f]+' \ | while read id; do - garage -c /tmp/config.1.toml layout assign $id -z dc1 -c 1 + garage -c /tmp/config.1.toml layout assign $id -z dc1 -c 1G done +garage -c /tmp/config.1.toml layout config -r 1 garage -c /tmp/config.1.toml layout apply --version 1 diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index 44d727f9..a539abb7 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -126,7 +126,7 @@ api_bind_addr = 
"127.0.0.1:{admin_port}" self.command() .args(["layout", "assign"]) .arg(node_short_id) - .args(["-c", "1", "-z", "unzonned"]) + .args(["-c", "1G", "-z", "unzonned"]) .quiet() .expect_success_status("Could not assign garage node layout"); self.command() -- cgit v1.2.3 From 217abdca18ff15190c0407b2b8b1ea204edcfb99 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 15:38:53 +0100 Subject: Fix HTTP return code --- src/api/admin/cluster.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 7b91f709..4386c0cc 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -167,7 +167,7 @@ pub async fn handle_apply_cluster_layout( garage.system.update_cluster_layout(&layout).await?; Ok(Response::builder() - .status(StatusCode::NO_CONTENT) + .status(StatusCode::OK) .header(http::header::CONTENT_TYPE, "text/plain") .body(Body::from(msg.join("\n")))?) } -- cgit v1.2.3 From ec12d6c8ddde0f1dc908e43fef0ecc88d1e5406b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 16:15:45 +0100 Subject: Slightly simplify code at places --- src/garage/cli/layout.rs | 11 ++++----- src/rpc/layout.rs | 61 ++++++++++++++---------------------------------- 2 files changed, 22 insertions(+), 50 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 53430e6b..27bb7eb8 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -209,16 +209,13 @@ pub async fn cmd_show_layout( "You can also revert all proposed changes with: garage layout revert --version {}", v + 1) } - Err(Error::Message(s)) => { - println!("Error while trying to compute the assignation: {}", s); + Err(e) => { + println!("Error while trying to compute the assignation: {}", e); println!("This new layout cannot yet be applied."); println!( "You can also revert all proposed changes with: garage layout revert --version {}", v + 1) } - _ => { - println!("Unknown Error"); - } } } @@ -355,7 
+352,7 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { id, tags, role.zone, - role.capacity_string(), + role.capacity_string() )); }; } @@ -372,7 +369,7 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { } pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { - let has_changes = layout.staging_parameters.get().clone() != layout.parameters; + let has_changes = *layout.staging_parameters.get() != layout.parameters; if has_changes { println!(); println!("==== NEW LAYOUT PARAMETERS ===="); diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 2f4dc129..133e33c8 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -100,16 +100,7 @@ impl NodeRole { } pub fn tags_string(&self) -> String { - let mut tags = String::new(); - if self.tags.is_empty() { - return tags; - } - tags.push_str(&self.tags[0].clone()); - for t in 1..self.tags.len() { - tags.push(','); - tags.push_str(&self.tags[t].clone()); - } - tags + self.tags.join(",") } } @@ -241,7 +232,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Returns the uuids of the non_gateway nodes in self.node_id_vec. 
- pub fn nongateway_nodes(&self) -> Vec { + fn nongateway_nodes(&self) -> Vec { let mut result = Vec::::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { @@ -253,7 +244,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Given a node uuids, this function returns the label of its zone - pub fn get_node_zone(&self, uuid: &Uuid) -> Result { + fn get_node_zone(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { Some(role) => Ok(role.zone.clone()), _ => Err(Error::Message( @@ -299,7 +290,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } /// Returns the sum of capacities of non gateway nodes in the cluster - pub fn get_total_capacity(&self) -> Result { + fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes().iter() { total_capacity += self.get_node_capacity(uuid)?; @@ -494,8 +485,7 @@ impl ClusterLayout { if partition_size < 100 { msg.push( - "WARNING: The partition size is low (< 100), you might consider to \ - provide the nodes capacities in a smaller unit (e.g. Mb instead of Gb)." 
+ "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" .into(), ); } @@ -533,7 +523,7 @@ impl ClusterLayout { // (1) We compute the new node list // Non gateway nodes should be coded on 8bits, hence they must be first in the list // We build the new node ids - let mut new_non_gateway_nodes: Vec = self + let new_non_gateway_nodes: Vec = self .roles .items() .iter() @@ -549,7 +539,7 @@ impl ClusterLayout { ))); } - let mut new_gateway_nodes: Vec = self + let new_gateway_nodes: Vec = self .roles .items() .iter() @@ -558,8 +548,8 @@ impl ClusterLayout { .collect(); let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.append(&mut new_non_gateway_nodes); - new_node_id_vec.append(&mut new_gateway_nodes); + new_node_id_vec.extend(new_non_gateway_nodes); + new_node_id_vec.extend(new_gateway_nodes); let old_node_id_vec = self.node_id_vec.clone(); self.node_id_vec = new_node_id_vec.clone(); @@ -567,12 +557,11 @@ impl ClusterLayout { // (2) We retrieve the old association // We rewrite the old association with the new indices. We only consider partition // to node assignations where the node is still in use. 
- let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; - if self.ring_assignation_data.is_empty() { // This is a new association return Ok(None); } + if self.ring_assignation_data.len() != NB_PARTITIONS * self.replication_factor { return Err(Error::Message( "The old assignation does not have a size corresponding to \ @@ -590,7 +579,9 @@ impl ClusterLayout { uuid_to_new_id.insert(*uuid, i); } + let mut old_assignation = vec![Vec::::new(); NB_PARTITIONS]; let rf = self.replication_factor; + for (p, old_assign_p) in old_assignation.iter_mut().enumerate() { for old_id in &self.ring_assignation_data[p * rf..(p + 1) * rf] { let uuid = old_node_id_vec[*old_id as usize]; @@ -613,18 +604,10 @@ impl ClusterLayout { let mut zone_to_id = HashMap::::new(); for uuid in self.nongateway_nodes().iter() { - if self.roles.get(uuid) == None { - return Err(Error::Message( - "The uuid was not found in the node roles (this should \ - not happen, it might be a critical error)." - .into(), - )); - } - if let Some(r) = self.node_role(uuid) { - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { - zone_to_id.insert(r.zone.clone(), id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } + let r = self.node_role(uuid).unwrap(); + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); } } Ok((id_to_zone, zone_to_id)) @@ -639,11 +622,7 @@ impl ClusterLayout { let empty_set = HashSet::<(usize, usize)>::new(); let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; - if g.get_flow_value()? - < (NB_PARTITIONS * self.replication_factor) - .try_into() - .unwrap() - { + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { return Err(Error::Message( "The storage capacity of he cluster is to small. It is \ impossible to store partitions of size 1." 
@@ -656,11 +635,7 @@ impl ClusterLayout { while s_down + 1 < s_up { g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; g.compute_maximal_flow()?; - if g.get_flow_value()? - < (NB_PARTITIONS * self.replication_factor) - .try_into() - .unwrap() - { + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { s_up = (s_down + s_up) / 2; } else { s_down = (s_down + s_up) / 2; -- cgit v1.2.3