diff options
Diffstat (limited to 'src/rpc/layout.rs')
-rw-r--r-- | src/rpc/layout.rs | 240 |
1 file changed, 150 insertions, 90 deletions
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index c2655e59..e02a180b 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,6 +1,7 @@ use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; +use std::fmt; use bytesize::ByteSize; use itertools::Itertools; @@ -115,7 +116,16 @@ mod v09 { /// algorithm. It is stored as a Crdt. #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] pub struct LayoutParameters { - pub zone_redundancy: usize, + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, } impl garage_util::migrate::Migrate for ClusterLayout { @@ -125,7 +135,6 @@ mod v09 { fn migrate(previous: Self::Previous) -> Self { use itertools::Itertools; - use std::collections::HashSet; // In the old layout, capacities are in an arbitrary unit, // but in the new layout they are in bytes. @@ -152,17 +161,10 @@ mod v09 { .min() .unwrap_or(0); - // Determine zone redundancy parameter - let zone_redundancy = std::cmp::min( - previous.replication_factor, - roles - .items() - .iter() - .filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str())) - .collect::<HashSet<&str>>() - .len(), - ); - let parameters = LayoutParameters { zone_redundancy }; + // By default, zone_redundancy is maximum possible value + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; let mut res = Self { version: previous.version, @@ -193,7 +195,7 @@ mod v09 { .. 
})) = role { - *cap = *cap * mul; + *cap *= mul; } new_roles.merge_raw(node, *ts, &role); } @@ -224,15 +226,39 @@ impl NodeRole { } } +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::<usize>() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + // Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be equal to the replication factor, - // i.e. as strict as possible. + // We set the default zone redundancy to be Maximum, meaning that the maximum + // possible value will be used depending on the cluster topology let parameters = LayoutParameters { - zone_redundancy: replication_factor, + zone_redundancy: ZoneRedundancy::Maximum, }; - let staging_parameters = Lww::<LayoutParameters>::new(parameters.clone()); + let staging_parameters = Lww::<LayoutParameters>::new(parameters); let empty_lwwmap = LwwMap::new(); @@ -296,7 +322,7 @@ To know the correct value of the new layout version, invoke `garage layout show` self.roles.merge(&self.staging_roles); self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = self.staging_parameters.get().clone(); + self.parameters = *self.staging_parameters.get(); self.staging_roles.clear(); self.staging_hash = self.calculate_staging_hash(); @@ -325,7 +351,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } self.staging_roles.clear(); - 
self.staging_parameters.update(self.parameters.clone()); + self.staging_parameters.update(self.parameters); self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -356,7 +382,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut result = Vec::<Uuid>::new(); for uuid in self.node_id_vec.iter() { match self.node_role(uuid) { - Some(role) if role.capacity != None => result.push(*uuid), + Some(role) if role.capacity.is_some() => result.push(*uuid), _ => (), } } @@ -418,6 +444,23 @@ To know the correct value of the new layout version, invoke `garage layout show` Ok(total_capacity) } + /// Returns the effective value of the zone_redundancy parameter + fn effective_zone_redundancy(&self) -> usize { + match self.parameters.zone_redundancy { + ZoneRedundancy::AtLeast(v) => v, + ZoneRedundancy::Maximum => { + let n_zones = self + .roles + .items() + .iter() + .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) + .collect::<HashSet<&str>>() + .len(); + std::cmp::min(n_zones, self.replication_factor) + } + } + } + /// Check a cluster layout for internal consistency /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error @@ -471,6 +514,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } // Check that every partition is associated to distinct nodes + let zone_redundancy = self.effective_zone_redundancy(); let rf = self.replication_factor; for p in 0..(1 << PARTITION_BITS) { let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); @@ -485,11 +529,10 @@ To know the correct value of the new layout version, invoke `garage layout show` .expect("Zone not found.") }) .collect::<Vec<_>>(); - let redundancy = self.parameters.zone_redundancy; - if zones_of_p.iter().unique().count() < redundancy { + if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( "nodes of partition are in less than {} 
distinct zones", - redundancy + zone_redundancy )); } } @@ -518,7 +561,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // algorithm. let cl2 = self.clone(); let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); - match cl2.compute_optimal_partition_size(&zone_to_id) { + match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { Ok(s) if s != self.partition_size => { return Err(format!( "partition_size ({}) is different than optimal value ({})", @@ -533,6 +576,8 @@ To know the correct value of the new layout version, invoke `garage layout show` } } +// ==================================================================================== + // Implementation of the ClusterLayout methods related to the assignment algorithm. impl ClusterLayout { /// This function calculates a new partition-to-node assignment. @@ -549,13 +594,15 @@ impl ClusterLayout { // changes in the layout. We retrieve the old_assignment reframed with new ids let old_assignment_opt = self.update_node_id_vec()?; + let zone_redundancy = self.effective_zone_redundancy(); + let mut msg = Message::new(); msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); msg.push("".into()); msg.push(format!( "Partitions are \ replicated {} times on at least {} distinct zones.", - self.replication_factor, self.parameters.zone_redundancy + self.replication_factor, zone_redundancy )); // We generate for once numerical ids for the zones of non gateway nodes, @@ -570,12 +617,12 @@ impl ClusterLayout { nb_nongateway_nodes, self.replication_factor ))); } - if id_to_zone.len() < self.parameters.zone_redundancy { + if id_to_zone.len() < zone_redundancy { return Err(Error::Message(format!( "The number of zones with non-gateway \ nodes ({}) is smaller than the redundancy parameter ({})", id_to_zone.len(), - self.parameters.zone_redundancy + zone_redundancy ))); } @@ -583,18 +630,18 @@ impl ClusterLayout { // Capacities should be given in a unit so 
that partition size is at least 100. // In this case, integer rounding plays a marginal role in the percentages of // optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; - if old_assignment_opt != None { + msg.push("".into()); + if old_assignment_opt.is_some() { msg.push(format!( - "Optimal size of a partition: {} (was {} in the previous layout).", + "Optimal partition size: {} ({} in previous layout)", ByteSize::b(partition_size).to_string_as(false), ByteSize::b(self.partition_size).to_string_as(false) )); } else { msg.push(format!( - "Given the replication and redundancy constraints, the \ - optimal size of a partition is {}.", + "Optimal partition size: {}", ByteSize::b(partition_size).to_string_as(false) )); } @@ -610,7 +657,8 @@ impl ClusterLayout { // We compute a first flow/assignment that is heuristically close to the previous // assignment - let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?; + let mut gflow = + self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; if let Some(assoc) = &old_assignment_opt { // We minimize the distance to the previous assignment. 
self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; @@ -618,7 +666,6 @@ impl ClusterLayout { // We display statistics of the computation msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); - msg.push("".to_string()); // We update the layout structure self.update_ring_from_flow(id_to_zone.len(), &gflow)?; @@ -645,7 +692,7 @@ impl ClusterLayout { .roles .items() .iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) .map(|(k, _, _)| *k) .collect(); @@ -661,7 +708,7 @@ impl ClusterLayout { .roles .items() .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) .map(|(k, _, _)| *k) .collect(); @@ -723,7 +770,7 @@ impl ClusterLayout { for uuid in self.nongateway_nodes().iter() { let r = self.node_role(uuid).unwrap(); - if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); id_to_zone.push(r.zone.clone()); } @@ -736,9 +783,10 @@ impl ClusterLayout { fn compute_optimal_partition_size( &self, zone_to_id: &HashMap<String, usize>, + zone_redundancy: usize, ) -> Result<u64, Error> { let empty_set = HashSet::<(usize, usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; g.compute_maximal_flow()?; if g.get_flow_value()? 
< (NB_PARTITIONS * self.replication_factor) as i64 { return Err(Error::Message( @@ -751,7 +799,12 @@ impl ClusterLayout { let mut s_down = 1; let mut s_up = self.get_total_capacity()?; while s_down + 1 < s_up { - g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g = self.generate_flow_graph( + (s_down + s_up) / 2, + zone_to_id, + &empty_set, + zone_redundancy, + )?; g.compute_maximal_flow()?; if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { s_up = (s_down + s_up) / 2; @@ -790,18 +843,18 @@ impl ClusterLayout { partition_size: u64, zone_to_id: &HashMap<String, usize>, exclude_assoc: &HashSet<(usize, usize)>, + zone_redundancy: usize, ) -> Result<Graph<FlowEdge>, Error> { let vertices = ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::<FlowEdge>::new(&vertices); let nb_zones = zone_to_id.len(); - let redundancy = self.parameters.zone_redundancy; for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; + g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; g.add_edge( Vertex::Source, Vertex::Pdown(p), - (self.replication_factor - redundancy) as u64, + (self.replication_factor - zone_redundancy) as u64, )?; for z in 0..nb_zones { g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; @@ -830,6 +883,7 @@ impl ClusterLayout { &self, zone_to_id: &HashMap<String, usize>, prev_assign_opt: &Option<Vec<Vec<usize>>>, + zone_redundancy: usize, ) -> Result<Graph<FlowEdge>, Error> { // We list the (partition,node) associations that are not used in the // previous assignment @@ -847,7 +901,12 @@ impl ClusterLayout { } // We compute the best flow using only the edges used in the previous assignment - let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + let mut g = self.generate_flow_graph( + self.partition_size, + zone_to_id, + &exclude_edge, + zone_redundancy, + )?; 
g.compute_maximal_flow()?; // We add the excluded edges and compute the maximal flow with the full graph. @@ -931,29 +990,33 @@ impl ClusterLayout { let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; let total_cap = self.get_total_capacity()?; let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); - msg.push("".into()); msg.push(format!( - "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", + "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", ByteSize::b(used_cap).to_string_as(false), ByteSize::b(total_cap).to_string_as(false), percent_cap )); - msg.push("".into()); - msg.push( - "If the percentage is too low, it might be that the \ - replication/redundancy constraints force the use of nodes/zones with small \ - storage capacities. \ - You might want to rebalance the storage capacities or relax the constraints. \ - See the detailed statistics below and look for saturated nodes/zones." - .into(), - ); msg.push(format!( - "Recall that because of the replication factor, the actual available \ - storage capacity is {} / {} = {}.", - ByteSize::b(used_cap).to_string_as(false), + "Effective capacity (replication factor {}): {}", self.replication_factor, ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) )); + if percent_cap < 80. { + msg.push("".into()); + msg.push( + "If the percentage is too low, it might be that the \ + cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ + storage capacities." + .into(), + ); + msg.push( + "You might want to move storage capacity between zones or relax the redundancy constraint." 
+ .into(), + ); + msg.push( + "See the detailed statistics below and look for saturated nodes/zones.".into(), + ); + } // We define and fill in the following tables let storing_nodes = self.nongateway_nodes(); @@ -992,25 +1055,25 @@ impl ClusterLayout { } } - if *prev_assign_opt == None { + if prev_assign_opt.is_none() { new_partitions = stored_partitions.clone(); - new_partitions_zone = stored_partitions_zone.clone(); + //new_partitions_zone = stored_partitions_zone.clone(); } // We display the statistics msg.push("".into()); - if *prev_assign_opt != None { + if prev_assign_opt.is_some() { let total_new_partitions: usize = new_partitions.iter().sum(); msg.push(format!( "A total of {} new copies of partitions need to be \ transferred.", total_new_partitions )); + msg.push("".into()); } - msg.push("".into()); - msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); + let mut table = vec![]; for z in 0..id_to_zone.len() { let mut nodes_of_z = Vec::<usize>::new(); for n in 0..storing_nodes.len() { @@ -1020,15 +1083,9 @@ impl ClusterLayout { } let replicated_partitions: usize = nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); - msg.push("".into()); - - msg.push(format!( - "Zone {}: {} distinct partitions stored ({} new, \ - {} partition copies) ", - id_to_zone[z], - stored_partitions_zone[z], - new_partitions_zone[z], - replicated_partitions + table.push(format!( + "{}\tTags\tPartitions\tCapacity\tUsable capacity", + id_to_zone[z] )); let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; @@ -1037,33 +1094,35 @@ impl ClusterLayout { total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); - msg.push(format!( - " Usable capacity / Total capacity: {} / {} ({:.1}%).", - ByteSize::b(available_cap_z).to_string_as(false), - ByteSize::b(total_cap_z).to_string_as(false), - percent_cap_z - )); for n in nodes_of_z.iter() { let available_cap_n = 
stored_partitions[*n] as u64 * self.partition_size; let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self - .node_role(&self.node_id_vec[*n]) - .ok_or("Node not found."))? - .tags_string(); - msg.push(format!( - " Node {:?}: {} partitions ({} new) ; \ - usable/total capacity: {} / {} ({:.1}%) ; tags:{}", + let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string(); + table.push(format!( + " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", self.node_id_vec[*n], + tags_n, stored_partitions[*n], new_partitions[*n], - ByteSize::b(available_cap_n).to_string_as(false), ByteSize::b(total_cap_n).to_string_as(false), + ByteSize::b(available_cap_n).to_string_as(false), (available_cap_n as f32) / (total_cap_n as f32) * 100.0, - tags_n )); } + + table.push(format!( + " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", + replicated_partitions, + stored_partitions_zone[z], + //new_partitions_zone[z], + ByteSize::b(total_cap_z).to_string_as(false), + ByteSize::b(available_cap_z).to_string_as(false), + percent_cap_z + )); + table.push("".into()); } + msg.push(format_table::format_table_to_string(table)); Ok(msg) } @@ -1125,7 +1184,7 @@ mod tests { let mut curr_zone = 0; - let redundancy = cl.parameters.zone_redundancy; + let redundancy = cl.effective_zone_redundancy(); for replic in 0..cl.replication_factor { for p in 0..NB_PARTITIONS { @@ -1177,8 +1236,9 @@ mod tests { ); cl.staging_roles.merge(&update); } - cl.staging_parameters - .update(LayoutParameters { zone_redundancy }); + cl.staging_parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); cl.staging_hash = cl.calculate_staging_hash(); } |