about | summary | refs | log | tree | commit | diff
path: root/src/api
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2022-12-05 15:28:57 +0100
committer: Alex Auvolat <alex@adnab.me> 2022-12-05 15:28:57 +0100
commit: 280d1be7b1fde13d23e47f75aa8acd2f90efb81f (patch)
tree: 5a74e5bdef1cef54360b2b3ca57a53bf1ce61ba2 /src/api
parent: 2065f011ca3f7c736feecffd108c89d3f8019e85 (diff)
download: garage-280d1be7b1fde13d23e47f75aa8acd2f90efb81f.tar.gz
garage-280d1be7b1fde13d23e47f75aa8acd2f90efb81f.zip
Refactor health check and add ability to return it in json
Diffstat (limited to 'src/api')
-rw-r--r-- src/api/admin/api_server.rs 131
-rw-r--r-- src/api/admin/router.rs 9
2 files changed, 55 insertions, 85 deletions
diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs
index 9beeda1f..f86ed599 100644
--- a/src/api/admin/api_server.rs
+++ b/src/api/admin/api_server.rs
@@ -1,4 +1,3 @@
-use std::collections::HashMap;
use std::fmt::Write;
use std::net::SocketAddr;
use std::sync::Arc;
@@ -17,8 +16,7 @@ use opentelemetry_prometheus::PrometheusExporter;
use prometheus::{Encoder, TextEncoder};
use garage_model::garage::Garage;
-use garage_rpc::layout::NodeRoleV;
-use garage_util::data::Uuid;
+use garage_rpc::system::ClusterHealthStatus;
use garage_util::error::Error as GarageError;
use crate::generic_server::*;
@@ -80,92 +78,61 @@ impl AdminApiServer {
.body(Body::empty())?)
}
- fn handle_health(&self) -> Result<Response<Body>, Error> {
- let ring: Arc<_> = self.garage.system.ring.borrow().clone();
- let quorum = self.garage.replication_mode.write_quorum();
- let replication_factor = self.garage.replication_mode.replication_factor();
+ fn handle_health(&self, format: Option<&str>) -> Result<Response<Body>, Error> {
+ let health = self.garage.system.health();
- let nodes = self
- .garage
- .system
- .get_known_nodes()
- .into_iter()
- .map(|n| (n.id, n))
- .collect::<HashMap<Uuid, _>>();
- let n_nodes_connected = nodes.iter().filter(|(_, n)| n.is_up).count();
-
- let storage_nodes = ring
- .layout
- .roles
- .items()
- .iter()
- .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some()))
- .collect::<Vec<_>>();
- let n_storage_nodes_ok = storage_nodes
- .iter()
- .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false))
- .count();
-
- let partitions = ring.partitions();
- let partitions_n_up = partitions
- .iter()
- .map(|(_, h)| {
- let pn = ring.get_nodes(h, ring.replication_factor);
- pn.iter()
- .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false))
- .count()
- })
- .collect::<Vec<usize>>();
- let n_partitions_full_ok = partitions_n_up
- .iter()
- .filter(|c| **c == replication_factor)
- .count();
- let n_partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count();
-
- let (status, status_str) = if n_partitions_quorum == partitions.len()
- && n_storage_nodes_ok == storage_nodes.len()
- {
- (StatusCode::OK, "Garage is fully operational")
- } else if n_partitions_quorum == partitions.len() {
- (
+ let (status, status_str) = match health.status {
+ ClusterHealthStatus::Healthy => (StatusCode::OK, "Garage is fully operational"),
+ ClusterHealthStatus::Degraded => (
StatusCode::OK,
"Garage is operational but some storage nodes are unavailable",
- )
- } else {
- (
+ ),
+ ClusterHealthStatus::Unavailable => (
StatusCode::SERVICE_UNAVAILABLE,
"Quorum is not available for some/all partitions, reads and writes will fail",
- )
+ ),
};
- let mut buf = status_str.to_string();
- writeln!(
- &mut buf,
- "\nAll nodes: {} connected, {} known",
- n_nodes_connected,
- nodes.len()
- )
- .unwrap();
- writeln!(
- &mut buf,
- "Storage nodes: {} connected, {} in layout",
- n_storage_nodes_ok,
- storage_nodes.len()
- )
- .unwrap();
- writeln!(&mut buf, "Number of partitions: {}", partitions.len()).unwrap();
- writeln!(&mut buf, "Partitions with quorum: {}", n_partitions_quorum).unwrap();
- writeln!(
- &mut buf,
- "Partitions with all nodes available: {}",
- n_partitions_full_ok
- )
- .unwrap();
+ let resp = Response::builder().status(status);
- Ok(Response::builder()
- .status(status)
- .header(http::header::CONTENT_TYPE, "text/plain")
- .body(Body::from(buf))?)
+ if matches!(format, Some("json")) {
+ let resp_json =
+ serde_json::to_string_pretty(&health).map_err(garage_util::error::Error::from)?;
+ Ok(resp
+ .header(http::header::CONTENT_TYPE, "application/json")
+ .body(Body::from(resp_json))?)
+ } else {
+ let mut buf = status_str.to_string();
+ writeln!(
+ &mut buf,
+ "\nAll nodes: {} connected, {} known",
+ health.connected_nodes, health.known_nodes,
+ )
+ .unwrap();
+ writeln!(
+ &mut buf,
+ "Storage nodes: {} connected, {} in layout",
+ health.storage_nodes_ok, health.storage_nodes
+ )
+ .unwrap();
+ writeln!(&mut buf, "Number of partitions: {}", health.partitions).unwrap();
+ writeln!(
+ &mut buf,
+ "Partitions with quorum: {}",
+ health.partitions_quorum
+ )
+ .unwrap();
+ writeln!(
+ &mut buf,
+ "Partitions with all nodes available: {}",
+ health.partitions_all_ok
+ )
+ .unwrap();
+
+ Ok(resp
+ .header(http::header::CONTENT_TYPE, "text/plain")
+ .body(Body::from(buf))?)
+ }
}
fn handle_metrics(&self) -> Result<Response<Body>, Error> {
@@ -240,7 +207,7 @@ impl ApiHandler for AdminApiServer {
match endpoint {
Endpoint::Options => self.handle_options(&req),
- Endpoint::Health => self.handle_health(),
+ Endpoint::Health { format } => self.handle_health(format.as_deref()),
Endpoint::Metrics => self.handle_metrics(),
Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await,
Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await,
diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs
index 14411f75..6ffcc131 100644
--- a/src/api/admin/router.rs
+++ b/src/api/admin/router.rs
@@ -17,7 +17,9 @@ router_match! {@func
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Endpoint {
Options,
- Health,
+ Health {
+ format: Option<String>,
+ },
Metrics,
GetClusterStatus,
ConnectClusterNodes,
@@ -90,7 +92,7 @@ impl Endpoint {
let res = router_match!(@gen_path_parser (req.method(), path, query) [
OPTIONS _ => Options,
- GET "/health" => Health,
+ GET "/health" => Health (query_opt::format),
GET "/metrics" => Metrics,
GET "/v0/status" => GetClusterStatus,
POST "/v0/connect" => ConnectClusterNodes,
@@ -133,7 +135,7 @@ impl Endpoint {
/// Get the kind of authorization which is required to perform the operation.
pub fn authorization_type(&self) -> Authorization {
match self {
- Self::Health => Authorization::None,
+ Self::Health { .. } => Authorization::None,
Self::Metrics => Authorization::MetricsToken,
_ => Authorization::AdminToken,
}
@@ -141,6 +143,7 @@ impl Endpoint {
}
generateQueryParameters! {
+ "format" => format,
"id" => id,
"search" => search,
"globalAlias" => global_alias,