From 2065f011ca3f7c736feecffd108c89d3f8019e85 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 5 Dec 2022 14:59:15 +0100 Subject: Implement /health admin API endpoint to check node health --- src/api/admin/api_server.rs | 94 +++++++++++++++++++++++++++++++++++++++++++++ src/api/admin/router.rs | 4 ++ 2 files changed, 98 insertions(+) (limited to 'src/api/admin') diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index 2896d058..9beeda1f 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; +use std::fmt::Write; use std::net::SocketAddr; use std::sync::Arc; @@ -15,6 +17,8 @@ use opentelemetry_prometheus::PrometheusExporter; use prometheus::{Encoder, TextEncoder}; use garage_model::garage::Garage; +use garage_rpc::layout::NodeRoleV; +use garage_util::data::Uuid; use garage_util::error::Error as GarageError; use crate::generic_server::*; @@ -76,6 +80,94 @@ impl AdminApiServer { .body(Body::empty())?) } + fn handle_health(&self) -> Result, Error> { + let ring: Arc<_> = self.garage.system.ring.borrow().clone(); + let quorum = self.garage.replication_mode.write_quorum(); + let replication_factor = self.garage.replication_mode.replication_factor(); + + let nodes = self + .garage + .system + .get_known_nodes() + .into_iter() + .map(|n| (n.id, n)) + .collect::>(); + let n_nodes_connected = nodes.iter().filter(|(_, n)| n.is_up).count(); + + let storage_nodes = ring + .layout + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) + .collect::>(); + let n_storage_nodes_ok = storage_nodes + .iter() + .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) + .count(); + + let partitions = ring.partitions(); + let partitions_n_up = partitions + .iter() + .map(|(_, h)| { + let pn = ring.get_nodes(h, ring.replication_factor); + pn.iter() + .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) + .count() + }) + .collect::>(); + let n_partitions_full_ok = partitions_n_up + .iter() + .filter(|c| **c == replication_factor) + .count(); + let n_partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count(); + + let (status, status_str) = if n_partitions_quorum == partitions.len() + && n_storage_nodes_ok == storage_nodes.len() + { + (StatusCode::OK, "Garage is fully operational") + } else if n_partitions_quorum == partitions.len() { + ( + StatusCode::OK, + "Garage is operational but some storage nodes are unavailable", + ) + } else { + ( + StatusCode::SERVICE_UNAVAILABLE, + "Quorum is not available for some/all partitions, reads and writes will fail", + ) + }; + + let mut buf = status_str.to_string(); + writeln!( + &mut buf, + "\nAll nodes: {} connected, {} known", + n_nodes_connected, + nodes.len() + ) + .unwrap(); + writeln!( + &mut buf, + "Storage nodes: {} connected, {} in layout", + n_storage_nodes_ok, + storage_nodes.len() + ) + .unwrap(); + writeln!(&mut buf, "Number of partitions: {}", partitions.len()).unwrap(); + writeln!(&mut buf, "Partitions with quorum: {}", n_partitions_quorum).unwrap(); + writeln!( + &mut buf, + "Partitions with all nodes available: {}", + n_partitions_full_ok + ) + .unwrap(); + + Ok(Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "text/plain") + .body(Body::from(buf))?) + } + fn handle_metrics(&self) -> Result, Error> { #[cfg(feature = "metrics")] { @@ -124,6 +216,7 @@ impl ApiHandler for AdminApiServer { ) -> Result, Error> { let expected_auth_header = match endpoint.authorization_type() { + Authorization::None => None, Authorization::MetricsToken => self.metrics_token.as_ref(), Authorization::AdminToken => match &self.admin_token { None => return Err(Error::forbidden( @@ -147,6 +240,7 @@ impl ApiHandler for AdminApiServer { match endpoint { Endpoint::Options => self.handle_options(&req), + Endpoint::Health => self.handle_health(), Endpoint::Metrics => self.handle_metrics(), Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await, Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await, diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs index 3eee8b67..14411f75 100644 --- a/src/api/admin/router.rs +++ b/src/api/admin/router.rs @@ -6,6 +6,7 @@ use crate::admin::error::*; use crate::router_macros::*; pub enum Authorization { + None, MetricsToken, AdminToken, } @@ -16,6 +17,7 @@ router_match! {@func #[derive(Debug, Clone, PartialEq, Eq)] pub enum Endpoint { Options, + Health, Metrics, GetClusterStatus, ConnectClusterNodes, @@ -88,6 +90,7 @@ impl Endpoint { let res = router_match!(@gen_path_parser (req.method(), path, query) [ OPTIONS _ => Options, + GET "/health" => Health, GET "/metrics" => Metrics, GET "/v0/status" => GetClusterStatus, POST "/v0/connect" => ConnectClusterNodes, @@ -130,6 +133,7 @@ impl Endpoint { /// Get the kind of authorization which is required to perform the operation. pub fn authorization_type(&self) -> Authorization { match self { + Self::Health => Authorization::None, Self::Metrics => Authorization::MetricsToken, _ => Authorization::AdminToken, } -- cgit v1.2.3 From 280d1be7b1fde13d23e47f75aa8acd2f90efb81f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 5 Dec 2022 15:28:57 +0100 Subject: Refactor health check and add ability to return it in json --- src/api/admin/api_server.rs | 131 +++++++++++++++++--------------------------- src/api/admin/router.rs | 9 ++- 2 files changed, 55 insertions(+), 85 deletions(-) (limited to 'src/api/admin') diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index 9beeda1f..f86ed599 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::fmt::Write; use std::net::SocketAddr; use std::sync::Arc; @@ -17,8 +16,7 @@ use opentelemetry_prometheus::PrometheusExporter; use prometheus::{Encoder, TextEncoder}; use garage_model::garage::Garage; -use garage_rpc::layout::NodeRoleV; -use garage_util::data::Uuid; +use garage_rpc::system::ClusterHealthStatus; use garage_util::error::Error as GarageError; use crate::generic_server::*; @@ -80,92 +78,61 @@ impl AdminApiServer { .body(Body::empty())?) } - fn handle_health(&self) -> Result, Error> { - let ring: Arc<_> = self.garage.system.ring.borrow().clone(); - let quorum = self.garage.replication_mode.write_quorum(); - let replication_factor = self.garage.replication_mode.replication_factor(); + fn handle_health(&self, format: Option<&str>) -> Result, Error> { + let health = self.garage.system.health(); - let nodes = self - .garage - .system - .get_known_nodes() - .into_iter() - .map(|n| (n.id, n)) - .collect::>(); - let n_nodes_connected = nodes.iter().filter(|(_, n)| n.is_up).count(); - - let storage_nodes = ring - .layout - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) - .collect::>(); - let n_storage_nodes_ok = storage_nodes - .iter() - .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count(); - - let partitions = ring.partitions(); - let partitions_n_up = partitions - .iter() - .map(|(_, h)| { - let pn = ring.get_nodes(h, ring.replication_factor); - pn.iter() - .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count() - }) - .collect::>(); - let n_partitions_full_ok = partitions_n_up - .iter() - .filter(|c| **c == replication_factor) - .count(); - let n_partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count(); - - let (status, status_str) = if n_partitions_quorum == partitions.len() - && n_storage_nodes_ok == storage_nodes.len() - { - (StatusCode::OK, "Garage is fully operational") - } else if n_partitions_quorum == partitions.len() { - ( + let (status, status_str) = match health.status { + ClusterHealthStatus::Healthy => (StatusCode::OK, "Garage is fully operational"), + ClusterHealthStatus::Degraded => ( StatusCode::OK, "Garage is operational but some storage nodes are unavailable", - ) - } else { - ( + ), + ClusterHealthStatus::Unavailable => ( StatusCode::SERVICE_UNAVAILABLE, "Quorum is not available for some/all partitions, reads and writes will fail", - ) + ), }; - let mut buf = status_str.to_string(); - writeln!( - &mut buf, - "\nAll nodes: {} connected, {} known", - n_nodes_connected, - nodes.len() - ) - .unwrap(); - writeln!( - &mut buf, - "Storage nodes: {} connected, {} in layout", - n_storage_nodes_ok, - storage_nodes.len() - ) - .unwrap(); - writeln!(&mut buf, "Number of partitions: {}", partitions.len()).unwrap(); - writeln!(&mut buf, "Partitions with quorum: {}", n_partitions_quorum).unwrap(); - writeln!( - &mut buf, - "Partitions with all nodes available: {}", - n_partitions_full_ok - ) - .unwrap(); + let resp = Response::builder().status(status); - Ok(Response::builder() - .status(status) - .header(http::header::CONTENT_TYPE, "text/plain") - .body(Body::from(buf))?) + if matches!(format, Some("json")) { + let resp_json = + serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?; + Ok(resp + .header(http::header::CONTENT_TYPE, "application/json") + .body(Body::from(resp_json))?) + } else { + let mut buf = status_str.to_string(); + writeln!( + &mut buf, + "\nAll nodes: {} connected, {} known", + health.connected_nodes, health.known_nodes, + ) + .unwrap(); + writeln!( + &mut buf, + "Storage nodes: {} connected, {} in layout", + health.storage_nodes_ok, health.storage_nodes + ) + .unwrap(); + writeln!(&mut buf, "Number of partitions: {}", health.partitions).unwrap(); + writeln!( + &mut buf, + "Partitions with quorum: {}", + health.partitions_quorum + ) + .unwrap(); + writeln!( + &mut buf, + "Partitions with all nodes available: {}", + health.partitions_all_ok + ) + .unwrap(); + + Ok(resp + .header(http::header::CONTENT_TYPE, "text/plain") + .body(Body::from(buf))?) + } } fn handle_metrics(&self) -> Result, Error> { @@ -240,7 +207,7 @@ impl ApiHandler for AdminApiServer { match endpoint { Endpoint::Options => self.handle_options(&req), - Endpoint::Health => self.handle_health(), + Endpoint::Health { format } => self.handle_health(format.as_deref()), Endpoint::Metrics => self.handle_metrics(), Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await, Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await, diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs index 14411f75..6ffcc131 100644 --- a/src/api/admin/router.rs +++ b/src/api/admin/router.rs @@ -17,7 +17,9 @@ router_match! {@func #[derive(Debug, Clone, PartialEq, Eq)] pub enum Endpoint { Options, - Health, + Health { + format: Option, + }, Metrics, GetClusterStatus, ConnectClusterNodes, @@ -90,7 +92,7 @@ impl Endpoint { let res = router_match!(@gen_path_parser (req.method(), path, query) [ OPTIONS _ => Options, - GET "/health" => Health, + GET "/health" => Health (query_opt::format), GET "/metrics" => Metrics, GET "/v0/status" => GetClusterStatus, POST "/v0/connect" => ConnectClusterNodes, @@ -133,7 +135,7 @@ impl Endpoint { /// Get the kind of authorization which is required to perform the operation. pub fn authorization_type(&self) -> Authorization { match self { - Self::Health => Authorization::None, + Self::Health { .. } => Authorization::None, Self::Metrics => Authorization::MetricsToken, _ => Authorization::AdminToken, } @@ -141,6 +143,7 @@ impl Endpoint { } generateQueryParameters! { + "format" => format, "id" => id, "search" => search, "globalAlias" => global_alias, -- cgit v1.2.3 From d7868c48a4d8d5831051a0be088fe7bbec259bca Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 5 Dec 2022 15:38:32 +0100 Subject: Separate /health (simple text answer) and /v0/health (full json answer, authenticated) --- src/api/admin/api_server.rs | 54 +++++++++------------------------------------ src/api/admin/cluster.rs | 17 ++++++++++++++ src/api/admin/router.rs | 10 ++++----- 3 files changed, 33 insertions(+), 48 deletions(-) (limited to 'src/api/admin') diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index f86ed599..2d325fb1 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -1,4 +1,3 @@ -use std::fmt::Write; use std::net::SocketAddr; use std::sync::Arc; @@ -78,7 +77,7 @@ impl AdminApiServer { .body(Body::empty())?) } - fn handle_health(&self, format: Option<&str>) -> Result, Error> { + fn handle_health(&self) -> Result, Error> { let health = self.garage.system.health(); let (status, status_str) = match health.status { @@ -92,47 +91,15 @@ impl AdminApiServer { "Quorum is not available for some/all partitions, reads and writes will fail", ), }; + let status_str = format!( + "{}\nConsult the full health check API endpoint at /v0/health for more details\n", + status_str + ); - let resp = Response::builder().status(status); - - if matches!(format, Some("json")) { - let resp_json = - serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?; - Ok(resp - .header(http::header::CONTENT_TYPE, "application/json") - .body(Body::from(resp_json))?) - } else { - let mut buf = status_str.to_string(); - writeln!( - &mut buf, - "\nAll nodes: {} connected, {} known", - health.connected_nodes, health.known_nodes, - ) - .unwrap(); - writeln!( - &mut buf, - "Storage nodes: {} connected, {} in layout", - health.storage_nodes_ok, health.storage_nodes - ) - .unwrap(); - writeln!(&mut buf, "Number of partitions: {}", health.partitions).unwrap(); - writeln!( - &mut buf, - "Partitions with quorum: {}", - health.partitions_quorum - ) - .unwrap(); - writeln!( - &mut buf, - "Partitions with all nodes available: {}", - health.partitions_all_ok - ) - .unwrap(); - - Ok(resp - .header(http::header::CONTENT_TYPE, "text/plain") - .body(Body::from(buf))?) - } + Ok(Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "text/plain") + .body(Body::from(status_str))?) } fn handle_metrics(&self) -> Result, Error> { @@ -207,9 +174,10 @@ impl ApiHandler for AdminApiServer { match endpoint { Endpoint::Options => self.handle_options(&req), - Endpoint::Health { format } => self.handle_health(format.as_deref()), + Endpoint::Health => self.handle_health(), Endpoint::Metrics => self.handle_metrics(), Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await, + Endpoint::GetClusterHealth => handle_get_cluster_health(&self.garage).await, Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await, // Layout Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await, diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 706db727..a773e27a 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -9,6 +9,7 @@ use garage_util::crdt::*; use garage_util::data::*; use garage_rpc::layout::*; +use garage_rpc::system::ClusterHealthStatus; use garage_model::garage::Garage; @@ -43,6 +44,22 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> Result, Error> { + let health = garage.system.health(); + + let status = match health.status { + ClusterHealthStatus::Unavailable => StatusCode::SERVICE_UNAVAILABLE, + _ => StatusCode::OK, + }; + + let resp_json = + serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?; + Ok(Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Body::from(resp_json))?) +} + pub async fn handle_connect_cluster_nodes( garage: &Arc, req: Request, diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs index 6ffcc131..3fa07b3c 100644 --- a/src/api/admin/router.rs +++ b/src/api/admin/router.rs @@ -17,11 +17,10 @@ router_match! {@func #[derive(Debug, Clone, PartialEq, Eq)] pub enum Endpoint { Options, - Health { - format: Option, - }, + Health, Metrics, GetClusterStatus, + GetClusterHealth, ConnectClusterNodes, // Layout GetClusterLayout, @@ -92,9 +91,10 @@ impl Endpoint { let res = router_match!(@gen_path_parser (req.method(), path, query) [ OPTIONS _ => Options, - GET "/health" => Health (query_opt::format), + GET "/health" => Health, GET "/metrics" => Metrics, GET "/v0/status" => GetClusterStatus, + GET "/v0/health" => GetClusterHealth, POST "/v0/connect" => ConnectClusterNodes, // Layout endpoints GET "/v0/layout" => GetClusterLayout, @@ -135,7 +135,7 @@ impl Endpoint { /// Get the kind of authorization which is required to perform the operation. pub fn authorization_type(&self) -> Authorization { match self { - Self::Health { .. } => Authorization::None, + Self::Health => Authorization::None, Self::Metrics => Authorization::MetricsToken, _ => Authorization::AdminToken, } -- cgit v1.2.3 From 5ea5fd2130ee16a2030f6f74cd1a8b641b1c2487 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 11 Dec 2022 18:11:12 +0100 Subject: Always return 200 OK on /v0/health, reinstate admin api doc as draft and complete it --- src/api/admin/cluster.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'src/api/admin') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index a773e27a..f4bf8b7b 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -47,15 +47,10 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> Result, Error> { let health = garage.system.health(); - let status = match health.status { - ClusterHealthStatus::Unavailable => StatusCode::SERVICE_UNAVAILABLE, - _ => StatusCode::OK, - }; - let resp_json = serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?; Ok(Response::builder() - .status(status) + .status(StatusCode::OK) .header(http::header::CONTENT_TYPE, "application/json") .body(Body::from(resp_json))?) } -- cgit v1.2.3 From 533afcf4e13022c46fd21ec51ca2a9969692ef4c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sun, 11 Dec 2022 18:17:08 +0100 Subject: simplify --- src/api/admin/cluster.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'src/api/admin') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index f4bf8b7b..182a4f6f 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -9,7 +9,6 @@ use garage_util::crdt::*; use garage_util::data::*; use garage_rpc::layout::*; -use garage_rpc::system::ClusterHealthStatus; use garage_model::garage::Garage; @@ -46,13 +45,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> Result, Error> { let health = garage.system.health(); - - let resp_json = - serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?; - Ok(Response::builder() - .status(StatusCode::OK) - .header(http::header::CONTENT_TYPE, "application/json") - .body(Body::from(resp_json))?) + Ok(json_ok_response(&health)?) } pub async fn handle_connect_cluster_nodes( -- cgit v1.2.3