author     Alex Auvolat <alex@adnab.me>   2022-02-10 16:10:21 +0100
committer  Alex Auvolat <alex@adnab.me>   2022-03-24 15:27:10 +0100
commit     413ab0eaedb12b8808897098263cefa4bc7a7663
tree       2a9a47cf07d4c69ee7ab4b257ce9de944a43e211
parent     43945234ae15734def2a9d29ee0880a7156c25a6
Small change to partition assignation algorithm
This change helps ensure that the nodes for each partition are spread over all datacenters, a property that wasn't guaranteed previously when going from a 2-datacenter deployment to a 3-datacenter deployment.
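
As a rough illustration of the constraint added below (a minimal sketch with a hypothetical helper name, not code from this commit): each partition keeps at least (replication_factor + 1) / 2 of its current nodes, and only the remaining slots may be reassigned to improve datacenter spread.

// Minimal sketch (hypothetical helper, not from the Garage codebase): how many
// nodes each partition must retain during the cleanup pass below.
fn min_keep_nodes_per_part(replication_factor: usize) -> usize {
    // Integer division: keep at least half of the desired copies, rounded up.
    (replication_factor + 1) / 2
}

fn main() {
    // With the common replication_factor = 3, at least 2 of a partition's nodes
    // are kept, so at most one copy can move to another datacenter per apply.
    for rf in 1..=5 {
        println!(
            "replication_factor = {} -> keep at least {}",
            rf,
            min_keep_nodes_per_part(rf)
        );
    }
}
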
-rw-r--r--  src/garage/cli/layout.rs  18
-rw-r--r--  src/rpc/layout.rs         28
2 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index 0d9e4fa4..e0aba1d1 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -196,6 +196,15 @@ pub async fn cmd_apply_layout(
) -> Result<(), Error> {
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
+ layout.roles.merge(&layout.staging);
+
+ if !layout.calculate_partition_assignation() {
+ return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
+ }
+
+ layout.staging.clear();
+ layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
+
match apply_opt.version {
None => {
println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
@@ -209,15 +218,6 @@ pub async fn cmd_apply_layout(
}
}
- layout.roles.merge(&layout.staging);
-
- if !layout.calculate_partition_assignation() {
- return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
- }
-
- layout.staging.clear();
- layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
-
layout.version += 1;
send_layout(rpc_cli, rpc_host, layout).await?;
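
The error path above triggers when there are fewer usable nodes than copies to place. A minimal sketch of that condition, with hypothetical names rather than Garage's actual API:

// Minimal sketch (hypothetical names, not Garage code): the situation described
// by the error message above, where the assignation cannot be computed because
// there are fewer usable nodes than copies requested by replication_mode.
fn assignation_is_possible(usable_nodes: usize, replication_factor: usize) -> bool {
    usable_nodes >= replication_factor
}

fn main() {
    // Two nodes cannot hold three distinct copies of each partition...
    assert!(!assignation_is_possible(2, 3));
    // ...while three or more nodes can.
    assert!(assignation_is_possible(3, 3));
}
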
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs
index 895dbf1c..a24bd9f3 100644
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@@ -172,12 +172,38 @@ impl ClusterLayout {
println!("Calculating updated partition assignation, this may take some time...");
println!();
+ // Get old partition assignation
let old_partitions = self.parse_assignation_data();
+ // Create new partition assignation starting from old one
let mut partitions = old_partitions.clone();
+
+ // Cleanup steps in new partition assignation:
+ let min_keep_nodes_per_part = (self.replication_factor + 1) / 2;
for part in partitions.iter_mut() {
+ // - remove from assignation nodes that don't have a role in the layout anymore
part.nodes
.retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
+
+ // - remove from assignation some nodes that are in the same datacenter
+ // if we can, so that the later steps can ensure datacenter variety
+ // as much as possible (but still under the constraint that each partition
+ // should not move from at least a certain number of nodes that is
+ // min_keep_nodes_per_part)
+ 'rmloop: while part.nodes.len() > min_keep_nodes_per_part {
+ let mut zns_c = HashMap::<&str, usize>::new();
+ for (_id, info) in part.nodes.iter() {
+ *zns_c.entry(info.unwrap().zone.as_str()).or_insert(0) += 1;
+ }
+ for i in 0..part.nodes.len() {
+ if zns_c[part.nodes[i].1.unwrap().zone.as_str()] > 1 {
+ part.nodes.remove(i);
+ continue 'rmloop;
+ }
+ }
+
+ break;
+ }
}
// When nodes are removed, or when bootstraping an assignation from
@@ -196,6 +222,8 @@ impl ClusterLayout {
}
}
None => {
+ // Not enough nodes in cluster to build a correct assignation.
+ // Signal it by returning an error.
return false;
}
}
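
For reference, the zone-aware trimming step from the hunk above, rewritten as a standalone sketch with simplified, hypothetical types (a node is reduced to an id and a zone name; this is not Garage's actual data model):

use std::collections::HashMap;

// Standalone sketch of the cleanup step: nodes whose zone appears more than
// once in the partition are removed first, but never below `min_keep` nodes.
fn trim_same_zone_nodes(nodes: &mut Vec<(u64, String)>, min_keep: usize) {
    'rmloop: while nodes.len() > min_keep {
        // Count how many of this partition's nodes live in each zone.
        let mut zone_counts: HashMap<String, usize> = HashMap::new();
        for (_id, zone) in nodes.iter() {
            *zone_counts.entry(zone.clone()).or_insert(0) += 1;
        }
        // Drop one node from a zone that holds more than one copy, then re-count.
        for i in 0..nodes.len() {
            if zone_counts[nodes[i].1.as_str()] > 1 {
                nodes.remove(i);
                continue 'rmloop;
            }
        }
        // Every remaining zone holds exactly one copy: stop removing.
        break;
    }
}

fn main() {
    // A partition currently assigned to two nodes in dc1 and one in dc2,
    // as after growing a 2-datacenter cluster into a 3-datacenter one.
    let mut nodes = vec![
        (1, "dc1".to_string()),
        (2, "dc1".to_string()),
        (3, "dc2".to_string()),
    ];
    // With replication_factor = 3, min_keep_nodes_per_part = (3 + 1) / 2 = 2.
    trim_same_zone_nodes(&mut nodes, 2);
    // One dc1 node was dropped; the freed slot can later be assigned to dc3.
    assert_eq!(nodes.len(), 2);
    println!("{:?}", nodes);
}
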