From ab722cb40f5aacf661a280b7eb025acd3aefc1bb Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 16:22:23 +0200 Subject: Add checks on replication_factor of layouts we use (fix #363, fix #364) --- src/model/garage.rs | 2 +- src/rpc/system.rs | 30 +++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/model/garage.rs b/src/model/garage.rs index ec1ec956..75012952 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -169,7 +169,7 @@ impl Garage { background.clone(), replication_mode.replication_factor(), &config, - ); + )?; let data_rep_param = TableShardedReplication { system: system.clone(), diff --git a/src/rpc/system.rs b/src/rpc/system.rs index c0e70c61..228b66a4 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -198,7 +198,7 @@ impl System { background: Arc, replication_factor: usize, config: &Config, - ) -> Arc { + ) -> Result, Error> { let node_key = gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); info!( @@ -206,11 +206,21 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout"); + let persist_cluster_layout: Persister = + Persister::new(&config.metadata_dir, "cluster_layout"); let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); let cluster_layout = match persist_cluster_layout.load() { - Ok(x) => x, + Ok(x) => { + if x.replication_factor != replication_factor { + return Err(Error::Message(format!( + "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", + x.replication_factor, + replication_factor + ))); + } + x + } Err(e) => { info!( "No valid previous cluster layout stored ({}), starting fresh.", @@ -303,7 +313,7 @@ impl System { metadata_dir: config.metadata_dir.clone(), }); sys.system_endpoint.set_handler(sys.clone()); - sys + Ok(sys) } /// Perform bootstraping, starting the ping loop @@ -485,7 +495,7 @@ impl System { let local_info = self.local_status.load(); if local_info.replication_factor < info.replication_factor { - error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and might lead to bugs", + error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. Shutting down for safety.", info.replication_factor, local_info.replication_factor); std::process::exit(1); @@ -513,6 +523,16 @@ impl System { self: &Arc, adv: &ClusterLayout, ) -> Result { + if adv.replication_factor != self.replication_factor { + let msg = format!( + "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", + adv.replication_factor, + self.replication_factor + ); + error!("{}", msg); + return Err(Error::Message(msg)); + } + let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3