aboutsummaryrefslogtreecommitdiff
path: root/src/api/admin/api_server.rs
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2022-12-05 14:59:15 +0100
committerAlex Auvolat <alex@adnab.me>2022-12-05 14:59:15 +0100
commit2065f011ca3f7c736feecffd108c89d3f8019e85 (patch)
tree86d761c1d5bea219ea2013a7e72762b42097291a /src/api/admin/api_server.rs
parent243b7c9a1cbf0e1dc4e9cf3b935bd833e29dcf9d (diff)
downloadgarage-2065f011ca3f7c736feecffd108c89d3f8019e85.tar.gz
garage-2065f011ca3f7c736feecffd108c89d3f8019e85.zip
Implement /health admin API endpoint to check node health
Diffstat (limited to 'src/api/admin/api_server.rs')
-rw-r--r--src/api/admin/api_server.rs94
1 files changed, 94 insertions, 0 deletions
diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs
index 2896d058..9beeda1f 100644
--- a/src/api/admin/api_server.rs
+++ b/src/api/admin/api_server.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+use std::fmt::Write;
use std::net::SocketAddr;
use std::sync::Arc;
@@ -15,6 +17,8 @@ use opentelemetry_prometheus::PrometheusExporter;
use prometheus::{Encoder, TextEncoder};
use garage_model::garage::Garage;
+use garage_rpc::layout::NodeRoleV;
+use garage_util::data::Uuid;
use garage_util::error::Error as GarageError;
use crate::generic_server::*;
@@ -76,6 +80,94 @@ impl AdminApiServer {
.body(Body::empty())?)
}
+ fn handle_health(&self) -> Result<Response<Body>, Error> {
+ let ring: Arc<_> = self.garage.system.ring.borrow().clone();
+ let quorum = self.garage.replication_mode.write_quorum();
+ let replication_factor = self.garage.replication_mode.replication_factor();
+
+ let nodes = self
+ .garage
+ .system
+ .get_known_nodes()
+ .into_iter()
+ .map(|n| (n.id, n))
+ .collect::<HashMap<Uuid, _>>();
+ let n_nodes_connected = nodes.iter().filter(|(_, n)| n.is_up).count();
+
+ let storage_nodes = ring
+ .layout
+ .roles
+ .items()
+ .iter()
+ .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some()))
+ .collect::<Vec<_>>();
+ let n_storage_nodes_ok = storage_nodes
+ .iter()
+ .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false))
+ .count();
+
+ let partitions = ring.partitions();
+ let partitions_n_up = partitions
+ .iter()
+ .map(|(_, h)| {
+ let pn = ring.get_nodes(h, ring.replication_factor);
+ pn.iter()
+ .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false))
+ .count()
+ })
+ .collect::<Vec<usize>>();
+ let n_partitions_full_ok = partitions_n_up
+ .iter()
+ .filter(|c| **c == replication_factor)
+ .count();
+ let n_partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count();
+
+ let (status, status_str) = if n_partitions_quorum == partitions.len()
+ && n_storage_nodes_ok == storage_nodes.len()
+ {
+ (StatusCode::OK, "Garage is fully operational")
+ } else if n_partitions_quorum == partitions.len() {
+ (
+ StatusCode::OK,
+ "Garage is operational but some storage nodes are unavailable",
+ )
+ } else {
+ (
+ StatusCode::SERVICE_UNAVAILABLE,
+ "Quorum is not available for some/all partitions, reads and writes will fail",
+ )
+ };
+
+ let mut buf = status_str.to_string();
+ writeln!(
+ &mut buf,
+ "\nAll nodes: {} connected, {} known",
+ n_nodes_connected,
+ nodes.len()
+ )
+ .unwrap();
+ writeln!(
+ &mut buf,
+ "Storage nodes: {} connected, {} in layout",
+ n_storage_nodes_ok,
+ storage_nodes.len()
+ )
+ .unwrap();
+ writeln!(&mut buf, "Number of partitions: {}", partitions.len()).unwrap();
+ writeln!(&mut buf, "Partitions with quorum: {}", n_partitions_quorum).unwrap();
+ writeln!(
+ &mut buf,
+ "Partitions with all nodes available: {}",
+ n_partitions_full_ok
+ )
+ .unwrap();
+
+ Ok(Response::builder()
+ .status(status)
+ .header(http::header::CONTENT_TYPE, "text/plain")
+ .body(Body::from(buf))?)
+ }
+
fn handle_metrics(&self) -> Result<Response<Body>, Error> {
#[cfg(feature = "metrics")]
{
@@ -124,6 +216,7 @@ impl ApiHandler for AdminApiServer {
) -> Result<Response<Body>, Error> {
let expected_auth_header =
match endpoint.authorization_type() {
+ Authorization::None => None,
Authorization::MetricsToken => self.metrics_token.as_ref(),
Authorization::AdminToken => match &self.admin_token {
None => return Err(Error::forbidden(
@@ -147,6 +240,7 @@ impl ApiHandler for AdminApiServer {
match endpoint {
Endpoint::Options => self.handle_options(&req),
+ Endpoint::Health => self.handle_health(),
Endpoint::Metrics => self.handle_metrics(),
Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await,
Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await,