From c7d0ad0aa0e492b913c5dda8ff1a7ad5a579fb3a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 26 Jan 2023 15:30:36 +0100 Subject: Add local disk usage to exported prometheus metrics --- src/rpc/lib.rs | 6 +++--- src/rpc/system.rs | 37 ++++++++++++++++++++++++++++--------- src/rpc/system_metrics.rs | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 12 deletions(-) diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index a8cc0030..5aec92c0 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -3,6 +3,9 @@ #[macro_use] extern crate tracing; +mod metrics; +mod system_metrics; + #[cfg(feature = "consul-discovery")] mod consul; #[cfg(feature = "kubernetes-discovery")] @@ -13,9 +16,6 @@ pub mod replication_mode; pub mod ring; pub mod system; -mod metrics; pub mod rpc_helper; pub use rpc_helper::*; - -pub mod system_metrics; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index a9e91e19..e0ced8cc 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; +use std::sync::atomic::Ordering; use std::sync::{Arc, RwLock}; use std::time::{Duration, Instant}; @@ -38,7 +39,6 @@ use crate::replication_mode::*; use crate::ring::*; use crate::rpc_helper::*; -#[cfg(feature = "metrics")] use crate::system_metrics::*; const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60); @@ -106,7 +106,7 @@ pub struct System { consul_discovery: Option, #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: Option, - #[cfg(feature = "metrics")] + metrics: SystemMetrics, replication_mode: ReplicationMode, @@ -281,12 +281,11 @@ impl System { } }; - let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout); - local_status.update_disk_usage(&config.metadata_dir, &config.data_dir); - - #[cfg(feature = "metrics")] let metrics = SystemMetrics::new(replication_factor); + let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout); + local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); + let ring = Ring::new(cluster_layout, replication_factor); let (update_ring, ring) = watch::channel(Arc::new(ring)); @@ -377,7 +376,6 @@ impl System { consul_discovery, #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: config.kubernetes_discovery.clone(), - #[cfg(feature = "metrics")] metrics, ring, @@ -601,7 +599,7 @@ impl System { new_si.cluster_layout_version = ring.layout.version; new_si.cluster_layout_staging_hash = ring.layout.staging_hash; - new_si.update_disk_usage(&self.metadata_dir, &self.data_dir); + new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); self.local_status.swap(Arc::new(new_si)); } @@ -892,7 +890,7 @@ impl NodeStatus { } } - fn update_disk_usage(&mut self, meta_dir: &Path, data_dir: &Path) { + fn update_disk_usage(&mut self, meta_dir: &Path, data_dir: &Path, metrics: &SystemMetrics) { use systemstat::{Platform, System}; let mounts = System::new().mounts().unwrap_or_default(); @@ -906,6 +904,27 @@ impl NodeStatus { self.meta_disk_avail = mount_avail(meta_dir); self.data_disk_avail = mount_avail(data_dir); + + if let Some((avail, total)) = self.meta_disk_avail { + metrics + .values + .meta_disk_avail + .store(avail, Ordering::Relaxed); + metrics + .values + .meta_disk_total + .store(total, Ordering::Relaxed); + } + if let Some((avail, total)) = self.data_disk_avail { + metrics + .values + .data_disk_avail + .store(avail, Ordering::Relaxed); + metrics + .values + .data_disk_total + .store(total, Ordering::Relaxed); + } } } diff --git a/src/rpc/system_metrics.rs b/src/rpc/system_metrics.rs index d96b67e4..83f5fa97 100644 --- a/src/rpc/system_metrics.rs +++ b/src/rpc/system_metrics.rs @@ -1,14 +1,31 @@ +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + use opentelemetry::{global, metrics::*, KeyValue}; /// TableMetrics reference all counter used for metrics pub struct SystemMetrics { pub(crate) _garage_build_info: ValueObserver, pub(crate) _replication_factor: ValueObserver, + pub(crate) _disk_avail: ValueObserver, + pub(crate) _disk_total: ValueObserver, + pub(crate) values: Arc, +} + +#[derive(Default)] +pub struct SystemMetricsValues { + pub(crate) data_disk_total: AtomicU64, + pub(crate) data_disk_avail: AtomicU64, + pub(crate) meta_disk_total: AtomicU64, + pub(crate) meta_disk_avail: AtomicU64, } impl SystemMetrics { pub fn new(replication_factor: usize) -> Self { let meter = global::meter("garage_system"); + let values = Arc::new(SystemMetricsValues::default()); + let values1 = values.clone(); + let values2 = values.clone(); Self { _garage_build_info: meter .u64_value_observer("garage_build_info", move |observer| { @@ -28,6 +45,33 @@ impl SystemMetrics { }) .with_description("Garage replication factor setting") .init(), + _disk_avail: meter + .u64_value_observer("garage_local_disk_avail", move |observer| { + match values1.data_disk_avail.load(Ordering::Relaxed) { + 0 => (), + x => observer.observe(x, &[KeyValue::new("volume", "data")]), + }; + match values1.meta_disk_avail.load(Ordering::Relaxed) { + 0 => (), + x => observer.observe(x, &[KeyValue::new("volume", "metadata")]), + }; + }) + .with_description("Garage available disk space on each node") + .init(), + _disk_total: meter + .u64_value_observer("garage_local_disk_total", move |observer| { + match values2.data_disk_total.load(Ordering::Relaxed) { + 0 => (), + x => observer.observe(x, &[KeyValue::new("volume", "data")]), + }; + match values2.meta_disk_total.load(Ordering::Relaxed) { + 0 => (), + x => observer.observe(x, &[KeyValue::new("volume", "metadata")]), + }; + }) + .with_description("Garage total disk space on each node") + .init(), + values, } } } -- cgit v1.2.3