author     Alex Auvolat <alex@adnab.me>    2022-02-10 16:10:21 +0100
committer  Alex Auvolat <alex@adnab.me>    2022-03-24 15:27:10 +0100
commit     413ab0eaedb12b8808897098263cefa4bc7a7663
tree       2a9a47cf07d4c69ee7ab4b257ce9de944a43e211
parent     43945234ae15734def2a9d29ee0880a7156c25a6
Small change to partition assignation algorithm
This change helps ensure that the nodes storing each partition are spread
over all datacenters, a property that was not previously guaranteed when
going from a 2-DC deployment to a 3-DC deployment.
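To illustrate the idea behind the new cleanup step in isolation, here is a minimal standalone sketch. It assumes a simplified representation in which a partition's replicas are plain (node id, zone) pairs, and the names drop_redundant_zones, part, n1/dc1 etc. are invented for the example; the actual implementation (in the diff below) operates on ClusterLayout's assignation data and node roles.

    use std::collections::HashMap;

    // A partition's replicas, simplified to (node id, zone) pairs. In the real
    // code the zone is looked up through each node's role in the layout.
    fn drop_redundant_zones(nodes: &mut Vec<(String, String)>, replication_factor: usize) {
        // Never shrink a partition below half (rounded up) of its replica count,
        // so that this cleanup does not move too much data at once.
        let min_keep = (replication_factor + 1) / 2;

        'rmloop: while nodes.len() > min_keep {
            // Count how many of this partition's nodes live in each zone.
            let mut zone_counts = HashMap::<String, usize>::new();
            for (_id, zone) in nodes.iter() {
                *zone_counts.entry(zone.clone()).or_insert(0) += 1;
            }
            // Drop one node from any over-represented zone, freeing a slot that
            // the assignation step can later fill from another zone.
            for i in 0..nodes.len() {
                let count = zone_counts[nodes[i].1.as_str()];
                if count > 1 {
                    nodes.remove(i);
                    continue 'rmloop;
                }
            }
            // All remaining zones are distinct: nothing left to clean up.
            break;
        }
    }

    fn main() {
        // Replication factor 3, but the old layout only had two datacenters.
        let mut part = vec![
            ("n1".to_string(), "dc1".to_string()),
            ("n2".to_string(), "dc1".to_string()),
            ("n3".to_string(), "dc2".to_string()),
        ];
        drop_redundant_zones(&mut part, 3);
        // One of the two dc1 replicas was dropped, leaving room for the
        // assignation algorithm to place a copy in the newly added dc3.
        assert_eq!(part.len(), 2);
        println!("{:?}", part);
    }

Removing a node from an over-represented zone, rather than reassigning it directly, keeps this step simple: the existing assignation algorithm then fills the freed slots while optimizing for zone diversity, as the code comments in the diff below describe.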
 src/garage/cli/layout.rs | 18
 src/rpc/layout.rs        | 28
 2 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index 0d9e4fa4..e0aba1d1 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -196,6 +196,15 @@ pub async fn cmd_apply_layout(
 ) -> Result<(), Error> {
     let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
 
+    layout.roles.merge(&layout.staging);
+
+    if !layout.calculate_partition_assignation() {
+        return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
+    }
+
+    layout.staging.clear();
+    layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
+
     match apply_opt.version {
         None => {
             println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
@@ -209,15 +218,6 @@ pub async fn cmd_apply_layout(
         }
     }
 
-    layout.roles.merge(&layout.staging);
-
-    if !layout.calculate_partition_assignation() {
-        return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
-    }
-
-    layout.staging.clear();
-    layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
-
     layout.version += 1;
 
     send_layout(rpc_cli, rpc_host, layout).await?;
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs
index 895dbf1c..a24bd9f3 100644
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@@ -172,12 +172,38 @@ impl ClusterLayout {
         println!("Calculating updated partition assignation, this may take some time...");
         println!();
 
+        // Get old partition assignation
         let old_partitions = self.parse_assignation_data();
 
+        // Create new partition assignation starting from old one
         let mut partitions = old_partitions.clone();
+
+        // Cleanup steps in new partition assignation:
+        let min_keep_nodes_per_part = (self.replication_factor + 1) / 2;
         for part in partitions.iter_mut() {
+            // - remove from assignation nodes that don't have a role in the layout anymore
             part.nodes
                 .retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
+
+            // - remove from assignation some nodes that are in the same datacenter
+            //   if we can, so that the later steps can ensure datacenter variety
+            //   as much as possible (but still under the constraint that each partition
+            //   should not move from at least a certain number of nodes that is
+            //   min_keep_nodes_per_part)
+            'rmloop: while part.nodes.len() > min_keep_nodes_per_part {
+                let mut zns_c = HashMap::<&str, usize>::new();
+                for (_id, info) in part.nodes.iter() {
+                    *zns_c.entry(info.unwrap().zone.as_str()).or_insert(0) += 1;
+                }
+                for i in 0..part.nodes.len() {
+                    if zns_c[part.nodes[i].1.unwrap().zone.as_str()] > 1 {
+                        part.nodes.remove(i);
+                        continue 'rmloop;
+                    }
+                }
+
+                break;
+            }
         }
 
         // When nodes are removed, or when bootstraping an assignation from
@@ -196,6 +222,8 @@ impl ClusterLayout {
                 }
             }
             None => {
+                // Not enough nodes in cluster to build a correct assignation.
+                // Signal it by returning an error.
                 return false;
             }
         }
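For reference, the min_keep_nodes_per_part bound above uses integer division; a quick throwaway check (not part of the patch) of the values it yields:

    fn main() {
        // min_keep_nodes_per_part = (replication_factor + 1) / 2, integer division.
        for replication_factor in 1..=4usize {
            let min_keep = (replication_factor + 1) / 2;
            println!(
                "replication_factor = {} -> keep at least {} node(s)",
                replication_factor, min_keep
            );
        }
        // Prints: 1 -> 1, 2 -> 1, 3 -> 2, 4 -> 2.
    }

With a replication factor of 3, each partition therefore keeps at least two of its current nodes, so this cleanup step reassigns at most one replica per partition.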