From d9a35359bf8903f37f5f7aa77421ed35b626e0af Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 22 Feb 2022 15:21:06 +0100 Subject: Add metrics to web endpoint --- src/admin/tracing_setup.rs | 3 +- src/api/api_server.rs | 21 ++++++------ src/util/metrics.rs | 9 +++++- src/web/Cargo.toml | 2 ++ src/web/web_server.rs | 80 +++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 98 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/admin/tracing_setup.rs b/src/admin/tracing_setup.rs index 83fa5622..c561d568 100644 --- a/src/admin/tracing_setup.rs +++ b/src/admin/tracing_setup.rs @@ -1,7 +1,7 @@ use std::time::Duration; use opentelemetry::sdk::{ - trace::{self, IdGenerator, Sampler}, + trace::{self, IdGenerator}, Resource, }; use opentelemetry::KeyValue; @@ -24,7 +24,6 @@ pub fn init_tracing(export_to: &str, node_id: Uuid) -> Result<(), Error> { .with_trace_config( trace::config() .with_id_generator(IdGenerator::default()) - .with_sampler(Sampler::TraceIdRatioBased(0.01f64)) .with_resource(Resource::new(vec![ KeyValue::new("service.name", "garage"), KeyValue::new("service.instance.id", node_id), diff --git a/src/api/api_server.rs b/src/api/api_server.rs index 8502d9d8..a6bf5a44 100644 --- a/src/api/api_server.rs +++ b/src/api/api_server.rs @@ -16,7 +16,7 @@ use opentelemetry::{ use garage_util::data::*; use garage_util::error::Error as GarageError; -use garage_util::metrics::RecordDuration; +use garage_util::metrics::{gen_trace_id, RecordDuration}; use garage_model::garage::Garage; use garage_model::key_table::Key; @@ -110,16 +110,12 @@ async fn handler( debug!("{:?}", req); let tracer = opentelemetry::global::tracer("garage"); - let trace_id = gen_uuid(); let span = tracer .span_builder("S3 API call (unknown)") - .with_trace_id( - opentelemetry::trace::TraceId::from_hex(&hex::encode(&trace_id.as_slice()[..16])) - .unwrap(), - ) + .with_trace_id(gen_trace_id()) .with_attributes(vec![ KeyValue::new("method", format!("{}", req.method())), - KeyValue::new("uri", req.uri().path().to_string()), + KeyValue::new("uri", req.uri().to_string()), ]) .start(&tracer); @@ -177,9 +173,14 @@ async fn handler_stage2( let (endpoint, bucket_name) = Endpoint::from_request(&req, bucket_name.map(ToOwned::to_owned))?; debug!("Endpoint: {:?}", endpoint); - Context::current() - .span() - .update_name::(format!("S3 API {}", endpoint.name())); + let current_context = Context::current(); + let current_span = current_context.span(); + current_span.update_name::(format!("S3 API {}", endpoint.name())); + current_span.set_attribute(KeyValue::new("endpoint", endpoint.name())); + current_span.set_attribute(KeyValue::new( + "bucket", + bucket_name.clone().unwrap_or_default(), + )); let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())]; diff --git a/src/util/metrics.rs b/src/util/metrics.rs index cd5aa182..1b05eabe 100644 --- a/src/util/metrics.rs +++ b/src/util/metrics.rs @@ -1,8 +1,9 @@ use std::time::SystemTime; use futures::{future::BoxFuture, Future, FutureExt}; +use rand::Rng; -use opentelemetry::{metrics::*, KeyValue}; +use opentelemetry::{metrics::*, trace::TraceId, KeyValue}; pub trait RecordDuration<'a>: 'a { type Output; @@ -48,3 +49,9 @@ where .boxed() } } + +// ---- + +pub fn gen_trace_id() -> TraceId { + rand::thread_rng().gen::<[u8; 16]>().into() +} diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 269e29a5..81a8c995 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -27,3 +27,5 @@ futures = "0.3" http = "0.2" hyper = { version = "0.14", features = ["server", "http1", "runtime", "tcp", "stream"] } + +opentelemetry = "0.17" diff --git a/src/web/web_server.rs b/src/web/web_server.rs index 80d2feb9..c51347a3 100644 --- a/src/web/web_server.rs +++ b/src/web/web_server.rs @@ -9,6 +9,13 @@ use hyper::{ Body, Method, Request, Response, Server, }; +use opentelemetry::{ + global, + metrics::{Counter, ValueRecorder}, + trace::{FutureExt, TraceContextExt, Tracer}, + Context, KeyValue, +}; + use crate::error::*; use garage_api::error::{Error as ApiError, OkOrBadRequest, OkOrInternalError}; @@ -20,6 +27,33 @@ use garage_model::garage::Garage; use garage_table::*; use garage_util::error::Error as GarageError; +use garage_util::metrics::{gen_trace_id, RecordDuration}; + +struct WebMetrics { + request_counter: Counter, + error_counter: Counter, + request_duration: ValueRecorder, +} + +impl WebMetrics { + fn new() -> Self { + let meter = global::meter("garage/web"); + Self { + request_counter: meter + .u64_counter("web.request_counter") + .with_description("Number of requests to the web endpoint") + .init(), + error_counter: meter + .u64_counter("web.error_counter") + .with_description("Number of requests to the web endpoint resulting in errors") + .init(), + request_duration: meter + .f64_value_recorder("web.request_duration") + .with_description("Duration of requests to the web endpoint") + .init(), + } + } +} /// Run a web server pub async fn run_web_server( @@ -28,13 +62,19 @@ pub async fn run_web_server( ) -> Result<(), GarageError> { let addr = &garage.config.s3_web.bind_addr; + let metrics = Arc::new(WebMetrics::new()); + let service = make_service_fn(|conn: &AddrStream| { let garage = garage.clone(); + let metrics = metrics.clone(); + let client_addr = conn.remote_addr(); async move { Ok::<_, Error>(service_fn(move |req: Request| { let garage = garage.clone(); - handle_request(garage, req, client_addr) + let metrics = metrics.clone(); + + handle_request(garage, metrics, req, client_addr) })) } }); @@ -49,23 +89,55 @@ pub async fn run_web_server( async fn handle_request( garage: Arc, + metrics: Arc, req: Request, addr: SocketAddr, ) -> Result, Infallible> { info!("{} {} {}", addr, req.method(), req.uri()); - match serve_file(garage, &req).await { + + // Lots of instrumentation + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer + .span_builder(format!("Web {} request", req.method())) + .with_trace_id(gen_trace_id()) + .with_attributes(vec![ + KeyValue::new("method", format!("{}", req.method())), + KeyValue::new("uri", req.uri().to_string()), + ]) + .start(&tracer); + + let metrics_tags = &[KeyValue::new("method", req.method().to_string())]; + + // The actual handler + let res = serve_file(garage, &req) + .with_context(Context::current_with_span(span)) + .record_duration(&metrics.request_duration, &metrics_tags[..]) + .await; + + // More instrumentation + metrics.request_counter.add(1, &metrics_tags[..]); + + // Returning the result + match res { Ok(res) => { - debug!("{} {} {}", req.method(), req.uri(), res.status()); + debug!("{} {} {}", req.method(), res.status(), req.uri()); Ok(res) } Err(error) => { info!( "{} {} {} {}", req.method(), - req.uri(), error.http_status_code(), + req.uri(), error ); + metrics.error_counter.add( + 1, + &[ + metrics_tags[0].clone(), + KeyValue::new("status_code", error.http_status_code().to_string()), + ], + ); Ok(error_to_res(error)) } } -- cgit v1.2.3