From 7f3249a23770fd4da981c2ecb1126da97e9b4ca5 Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 21 Sep 2022 14:39:59 +0200 Subject: New version of the algorithm that calculates the layout. It takes as parameters the replication factor and the zone redundancy, computes the largest partition size reachable with these constraints, and among the possible assignments with this partition size, it computes the one that moves the least number of partitions compared to the previous assignment. This computation uses graph algorithms defined in graph_algo.rs --- src/rpc/system.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 68d94ea5..313671ca 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -97,6 +97,7 @@ pub struct System { kubernetes_discovery: Option, replication_factor: usize, + zone_redundancy: usize, /// The ring pub ring: watch::Receiver>, @@ -192,6 +193,7 @@ impl System { network_key: NetworkKey, background: Arc, replication_factor: usize, + zone_redundancy: usize, config: &Config, ) -> Arc { let node_key = @@ -211,7 +213,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor) + ClusterLayout::new(replication_factor, zone_redundancy) } }; @@ -285,6 +287,7 @@ impl System { rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), system_endpoint, replication_factor, + zone_redundancy, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, bootstrap_peers: config.bootstrap_peers.clone(), -- cgit v1.2.3 From 99f96b9564c9c841dc6c56f1255a6e70ff884d46 Mon Sep 17 00:00:00 2001 From: Mendes Date: Tue, 4 Oct 2022 18:09:24 +0200 Subject: deleted zone_redundancy from System struct --- src/rpc/system.rs | 2 -- 1 file changed, 2 deletions(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 313671ca..34031b10 100644 ---
a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -97,7 +97,6 @@ pub struct System { kubernetes_discovery: Option, replication_factor: usize, - zone_redundancy: usize, /// The ring pub ring: watch::Receiver>, @@ -287,7 +286,6 @@ impl System { rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), system_endpoint, replication_factor, - zone_redundancy, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, bootstrap_peers: config.bootstrap_peers.clone(), -- cgit v1.2.3 From ceac3713d6639f9170fc3b4475fae4a30b34483c Mon Sep 17 00:00:00 2001 From: Mendes Date: Wed, 5 Oct 2022 15:29:48 +0200 Subject: modifications in several files to : - have consistent error return types - store the zone redundancy in a Lww - print the error and message in the CLI (TODO: for the server Api, should msg be returned in the body response?) --- src/rpc/system.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 7eb25195..9e0bfa11 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -196,7 +196,6 @@ impl System { network_key: NetworkKey, background: Arc, replication_factor: usize, - zone_redundancy: usize, config: &Config, ) -> Result, Error> { let node_key = @@ -226,7 +225,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor, zone_redundancy) + ClusterLayout::new(replication_factor) } }; -- cgit v1.2.3 From 9407df60cc00fc70c10f73bc4b600085789d5353 Mon Sep 17 00:00:00 2001 From: Mendes Date: Thu, 6 Oct 2022 12:54:51 +0200 Subject: Corrected two bugs: - self.node_id_vec was not properly updated when the previous ring was empty - ClusterLayout::merge was not considering changes in the layout parameters --- src/rpc/system.rs | 1 + 1 file changed, 1 insertion(+) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 9e0bfa11..655d21de 100644 --- 
a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,6 +565,7 @@ impl System { return Err(Error::Message(msg)); } + let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3 From 4abab246f1113a9a1988fdfca81c1dd8ffa323c8 Mon Sep 17 00:00:00 2001 From: Mendes Date: Mon, 10 Oct 2022 17:21:13 +0200 Subject: cargo fmt --- src/rpc/system.rs | 1 - 1 file changed, 1 deletion(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 655d21de..9e0bfa11 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,7 +565,6 @@ impl System { return Err(Error::Message(msg)); } - let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3 From d75b37b018fc0ce8e3832c8531d9556ff7a345c9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 8 Nov 2022 14:23:08 +0100 Subject: Return more info when layout's .check() fails, fix compilation, fix test --- src/rpc/system.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index d6576f20..224fbabb 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -565,9 +565,9 @@ impl System { let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); - let prev_layout_check = layout.check(); + let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { - if prev_layout_check && !layout.check() { + if prev_layout_check && !layout.check().is_ok() { error!("New cluster layout is invalid, discarding."); return Err(Error::Message( "New cluster layout is invalid, discarding.".into(), @@ -620,7 +620,7 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = !self.ring.borrow().layout.check(); + let not_configured = 
!self.ring.borrow().layout.check().is_ok(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; let expected_n_nodes = self.ring.borrow().layout.num_nodes(); let bad_peers = self -- cgit v1.2.3 From 35c108b85d2b70ad28cd93bfd412607a89b9acf9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 14 Jun 2023 13:53:19 +0200 Subject: admin api: switch GetClusterHealth to camelcase (fix #381 again) --- src/rpc/system.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index c549d8fc..1675e70e 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -151,7 +151,7 @@ pub struct KnownNodeInfo { pub status: NodeStatus, } -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy)] pub struct ClusterHealth { /// The current health status of the cluster (see below) pub status: ClusterHealthStatus, @@ -171,7 +171,7 @@ pub struct ClusterHealth { pub partitions_all_ok: usize, } -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy)] pub enum ClusterHealthStatus { /// All nodes are available Healthy, -- cgit v1.2.3 From 71c0188055e25aa1c00d0226f0ca99ce323310a6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 4 Sep 2023 14:49:49 +0200 Subject: block manager: skeleton for multi-hdd support --- src/rpc/system.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 1675e70e..c5751d5d 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -22,9 +22,9 @@ use netapp::peering::fullmesh::FullMeshPeeringStrategy; use netapp::util::parse_and_resolve_peer_addr_async; use netapp::{NetApp, NetworkKey, NodeID, NodeKey}; -use garage_util::config::Config; #[cfg(feature = "kubernetes-discovery")] use garage_util::config::KubernetesDiscoveryConfig; +use garage_util::config::{Config, DataDirEnum}; use 
garage_util::data::*; use garage_util::error::*; use garage_util::persister::Persister; @@ -119,7 +119,7 @@ pub struct System { /// Path to metadata directory pub metadata_dir: PathBuf, /// Path to data directory - pub data_dir: PathBuf, + pub data_dir: DataDirEnum, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -890,7 +890,12 @@ impl NodeStatus { } } - fn update_disk_usage(&mut self, meta_dir: &Path, data_dir: &Path, metrics: &SystemMetrics) { + fn update_disk_usage( + &mut self, + meta_dir: &Path, + data_dir: &DataDirEnum, + metrics: &SystemMetrics, + ) { use systemstat::{Platform, System}; let mounts = System::new().mounts().unwrap_or_default(); @@ -903,7 +908,17 @@ impl NodeStatus { }; self.meta_disk_avail = mount_avail(meta_dir); - self.data_disk_avail = mount_avail(data_dir); + self.data_disk_avail = match data_dir { + DataDirEnum::Single(dir) => mount_avail(dir), + DataDirEnum::Multiple(dirs) => { + dirs.iter() + .map(|d| mount_avail(&d.path)) + .fold(Some((0, 0)), |acc, cur| match (acc, cur) { + (Some((x, y)), Some((a, b))) => Some((x + a, y + b)), + _ => None, + }) + } + }; if let Some((avail, total)) = self.meta_disk_avail { metrics -- cgit v1.2.3 From 2f112ac6827d24f5e8c87915a31a86ec721ebf9e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Sep 2023 14:42:20 +0200 Subject: correct free data space accounting for multiple data dirs on same fs --- src/rpc/system.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index c5751d5d..cf480549 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -911,12 +911,30 @@ impl NodeStatus { self.data_disk_avail = match data_dir { DataDirEnum::Single(dir) => mount_avail(dir), DataDirEnum::Multiple(dirs) => { - dirs.iter() - .map(|d| mount_avail(&d.path)) - .fold(Some((0, 0)), |acc, cur| match (acc, cur) { - (Some((x, y)), Some((a, b))) => Some((x + a, y + b)), - _ => None, + // Take mounts 
corresponding to all specified data directories that + // can be used for writing data + let mounts = dirs + .iter() + .filter(|dir| dir.capacity.is_some()) + .map(|dir| { + mounts + .iter() + .filter(|mnt| dir.path.starts_with(&mnt.fs_mounted_on)) + .max_by_key(|mnt| mnt.fs_mounted_on.len()) }) + .collect::>(); + if mounts.iter().any(|x| x.is_none()) { + None // could not get info for at least one mount + } else { + // dedup mounts in case several data directories are on the same filesystem + let mut mounts = mounts.iter().map(|x| x.unwrap()).collect::>(); + mounts.sort_by(|x, y| x.fs_mounted_on.cmp(&y.fs_mounted_on)); + mounts.dedup_by(|x, y| x.fs_mounted_on == y.fs_mounted_on); + // calculate sum of available and total space + Some(mounts.iter().fold((0, 0), |(x, y), mnt| { + (x + mnt.avail.as_u64(), y + mnt.total.as_u64()) + })) + } } }; -- cgit v1.2.3 From 0088599f52f38ae9e00fe772a416150813e2470b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Sep 2023 12:17:07 +0200 Subject: new layout: fix clippy lints --- src/rpc/system.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/rpc/system.rs') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 8fba9580..7fc3c20c 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -668,7 +668,7 @@ impl System { let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { - if prev_layout_check && !layout.check().is_ok() { + if prev_layout_check && layout.check().is_err() { error!("New cluster layout is invalid, discarding."); return Err(Error::Message( "New cluster layout is invalid, discarding.".into(), @@ -724,7 +724,7 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = !self.ring.borrow().layout.check().is_ok(); + let not_configured = self.ring.borrow().layout.check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; let expected_n_nodes = 
self.ring.borrow().layout.num_nodes(); let bad_peers = self -- cgit v1.2.3