diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/admin/Cargo.toml | 1 | ||||
-rw-r--r-- | src/admin/lib.rs | 1 | ||||
-rw-r--r-- | src/admin/metrics.rs | 21 | ||||
-rw-r--r-- | src/garage/tests/common/garage.rs | 4 | ||||
-rw-r--r-- | src/model/Cargo.toml | 1 | ||||
-rw-r--r-- | src/model/block.rs | 55 | ||||
-rw-r--r-- | src/model/block_metrics.rs | 93 | ||||
-rw-r--r-- | src/model/lib.rs | 1 | ||||
-rw-r--r-- | src/rpc/Cargo.toml | 3 | ||||
-rw-r--r-- | src/rpc/lib.rs | 1 | ||||
-rw-r--r-- | src/rpc/metrics.rs | 55 | ||||
-rw-r--r-- | src/rpc/rpc_helper.rs | 41 | ||||
-rw-r--r-- | src/table/data.rs | 6 | ||||
-rw-r--r-- | src/table/metrics.rs | 75 | ||||
-rw-r--r-- | src/table/sync.rs | 32 | ||||
-rw-r--r-- | src/table/table.rs | 34 |
16 files changed, 391 insertions, 33 deletions
diff --git a/src/admin/Cargo.toml b/src/admin/Cargo.toml index 9775b667..6f646869 100644 --- a/src/admin/Cargo.toml +++ b/src/admin/Cargo.toml @@ -25,4 +25,3 @@ log = "0.4" opentelemetry = "0.17" opentelemetry-prometheus = "0.10" prometheus = "0.13" -lazy_static = "1.4" diff --git a/src/admin/lib.rs b/src/admin/lib.rs index 443361be..f1e8ddd7 100644 --- a/src/admin/lib.rs +++ b/src/admin/lib.rs @@ -1,6 +1,5 @@ //! Crate for handling the admin and metric HTTP APIs #[macro_use] extern crate log; -extern crate lazy_static; pub mod metrics; diff --git a/src/admin/metrics.rs b/src/admin/metrics.rs index ccc26d26..44fd4cb2 100644 --- a/src/admin/metrics.rs +++ b/src/admin/metrics.rs @@ -3,11 +3,9 @@ use hyper::{ service::{make_service_fn, service_fn}, Body, Method, Request, Response, Server, }; -use lazy_static::lazy_static; use opentelemetry::{ global, metrics::{BoundCounter, BoundValueRecorder}, - KeyValue, }; use opentelemetry_prometheus::PrometheusExporter; use prometheus::{Encoder, TextEncoder}; @@ -19,11 +17,6 @@ use futures::future::*; use garage_model::garage::Garage; use garage_util::error::Error as GarageError; -lazy_static! { - // This defines the differennt tags that will be referenced by the object - static ref HANDLER_ALL: [KeyValue; 1] = [KeyValue::new("handler", "all")]; -} - // serve_req on metric endpoint async fn serve_req( req: Request<Body>, @@ -87,20 +80,20 @@ impl AdminServer { exporter, metrics: AdminServerMetrics { http_counter: meter - .u64_counter("router.http_requests_total") + .u64_counter("admin.http_requests_total") .with_description("Total number of HTTP requests made.") .init() - .bind(HANDLER_ALL.as_ref()), + .bind(&[]), http_body_gauge: meter - .u64_value_recorder("example.http_response_size_bytes") + .u64_value_recorder("admin.http_response_size_bytes") .with_description("The metrics HTTP response sizes in bytes.") .init() - .bind(HANDLER_ALL.as_ref()), + .bind(&[]), http_req_histogram: meter - .f64_value_recorder("example.http_request_duration_seconds") + .f64_value_recorder("admin.http_request_duration_seconds") .with_description("The HTTP request latencies in seconds.") .init() - .bind(HANDLER_ALL.as_ref()), + .bind(&[]), }, } } @@ -125,7 +118,7 @@ impl AdminServer { let addr = &garage.config.admin_api.bind_addr; - let server = Server::bind(&addr).serve(make_svc); + let server = Server::bind(addr).serve(make_svc); let graceful = server.with_graceful_shutdown(shutdown_signal); info!("Admin server listening on http://{}", addr); diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index 92aa2edf..12cf946b 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -65,6 +65,9 @@ root_domain = ".s3.garage" bind_addr = "127.0.0.1:{web_port}" root_domain = ".web.garage" index = "index.html" + +[admin_api] +bind_addr = "127.0.0.1:{admin_port}" "#, path = path.display(), secret = GARAGE_TEST_SECRET, @@ -72,6 +75,7 @@ index = "index.html" api_port = port, rpc_port = port + 1, web_port = port + 2, + admin_port = port + 3, ); fs::write(path.join("config.toml"), config).expect("Could not write garage config file"); diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 14e49557..10a4c838 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -36,6 +36,7 @@ serde_bytes = "0.11" futures = "0.3" futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } +opentelemetry = "0.17" #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" } netapp = "0.3.0" diff --git a/src/model/block.rs b/src/model/block.rs index 1173c7b3..9e939c24 100644 --- a/src/model/block.rs +++ b/src/model/block.rs @@ -1,12 +1,13 @@ use std::convert::TryInto; use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use async_trait::async_trait; use futures::future::*; use futures::select; +use opentelemetry::KeyValue; use serde::{Deserialize, Serialize}; use tokio::fs; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -23,8 +24,8 @@ use garage_rpc::*; use garage_table::replication::{TableReplication, TableShardedReplication}; +use crate::block_metrics::*; use crate::block_ref_table::*; - use crate::garage::Garage; /// Size under which data will be stored inlined in database instead of as files @@ -154,6 +155,8 @@ pub struct BlockManager { system: Arc<System>, endpoint: Arc<Endpoint<BlockRpc, Self>>, pub(crate) garage: ArcSwapOption<Garage>, + + metrics: BlockManagerMetrics, } // This custom struct contains functions that must only be ran @@ -182,6 +185,8 @@ impl BlockManager { let manager_locked = BlockManagerLocked(); + let metrics = BlockManagerMetrics::new(resync_queue.clone()); + let block_manager = Arc::new(Self { replication, data_dir, @@ -192,6 +197,7 @@ impl BlockManager { system, endpoint, garage: ArcSwapOption::from(None), + metrics, }); block_manager.endpoint.set_handler(block_manager.clone()); @@ -380,15 +386,28 @@ impl BlockManager { /// Write a block to disk async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<BlockRpc, Error> { - self.mutation_lock + let request_start = SystemTime::now(); + let write_size = data.inner_buffer().len() as u64; + + let res = self + .mutation_lock .lock() .await .write_block(hash, data, self) - .await + .await?; + + self.metrics.bytes_written.add(write_size); + self.metrics + .block_write_duration + .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); + + Ok(res) } /// Read block from disk, verifying it's integrity async fn read_block(&self, hash: &Hash) -> Result<BlockRpc, Error> { + let request_start = SystemTime::now(); + let mut path = self.block_path(hash); let compressed = match self.is_block_compressed(hash).await { Ok(c) => c, @@ -414,6 +433,8 @@ impl BlockManager { }; if data.verify(*hash).is_err() { + self.metrics.corruption_counter.add(1); + self.mutation_lock .lock() .await @@ -423,6 +444,13 @@ impl BlockManager { return Err(Error::CorruptData(*hash)); } + self.metrics + .bytes_read + .add(data.inner_buffer().len() as u64); + self.metrics + .block_read_duration + .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); + Ok(BlockRpc::PutBlock { hash: *hash, data }) } @@ -521,9 +549,18 @@ impl BlockManager { let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap()); let now = now_msec(); if now >= time_msec { + let start_time = SystemTime::now(); + let hash = Hash::try_from(&hash_bytes[..]).unwrap(); let res = self.resync_block(&hash).await; + + self.metrics.resync_counter.add(1); + self.metrics + .resync_duration + .record(start_time.elapsed().map_or(0.0, |d| d.as_secs_f64())); + if let Err(e) = &res { + self.metrics.resync_error_counter.add(1); warn!("Error when resyncing {:?}: {}", hash, e); self.put_to_resync(&hash, RESYNC_RETRY_DELAY)?; } @@ -607,6 +644,12 @@ impl BlockManager { need_nodes.len() ); + for node in need_nodes.iter() { + self.metrics + .resync_send_counter + .add(1, &[KeyValue::new("to", format!("{:?}", node))]); + } + let put_block_message = self.read_block(hash).await?; self.system .rpc @@ -644,6 +687,9 @@ impl BlockManager { ); let block_data = self.rpc_get_raw_block(hash).await?; + + self.metrics.resync_recv_counter.add(1); + self.write_block(hash, &block_data).await?; } @@ -819,6 +865,7 @@ impl BlockManagerLocked { path.set_extension("zst"); } fs::remove_file(path).await?; + mgr.metrics.delete_counter.add(1); } Ok(()) } diff --git a/src/model/block_metrics.rs b/src/model/block_metrics.rs new file mode 100644 index 00000000..7ef9a117 --- /dev/null +++ b/src/model/block_metrics.rs @@ -0,0 +1,93 @@ +use opentelemetry::{global, metrics::*}; + +/// TableMetrics reference all counter used for metrics +pub struct BlockManagerMetrics { + pub(crate) _resync_queue_len: ValueObserver<u64>, + + pub(crate) resync_counter: BoundCounter<u64>, + pub(crate) resync_error_counter: BoundCounter<u64>, + pub(crate) resync_duration: BoundValueRecorder<f64>, + pub(crate) resync_send_counter: Counter<u64>, + pub(crate) resync_recv_counter: BoundCounter<u64>, + + pub(crate) bytes_read: BoundCounter<u64>, + pub(crate) block_read_duration: BoundValueRecorder<f64>, + pub(crate) bytes_written: BoundCounter<u64>, + pub(crate) block_write_duration: BoundValueRecorder<f64>, + pub(crate) delete_counter: BoundCounter<u64>, + + pub(crate) corruption_counter: BoundCounter<u64>, +} + +impl BlockManagerMetrics { + pub fn new(resync_queue: sled::Tree) -> Self { + let meter = global::meter("garage_model/block"); + Self { + _resync_queue_len: meter + .u64_value_observer("block.resync_queue_length", move |observer| { + observer.observe(resync_queue.len() as u64, &[]) + }) + .with_description( + "Number of block hashes queued for local check and possible resync", + ) + .init(), + + resync_counter: meter + .u64_counter("block.resync_counter") + .with_description("Number of calls to resync_block") + .init() + .bind(&[]), + resync_error_counter: meter + .u64_counter("block.resync_error_counter") + .with_description("Number of calls to resync_block that returned an error") + .init() + .bind(&[]), + resync_duration: meter + .f64_value_recorder("block.resync_duration") + .with_description("Duration of resync_block operations") + .init() + .bind(&[]), + resync_send_counter: meter + .u64_counter("block.resync_send_counter") + .with_description("Number of blocks sent to another node in resync operations") + .init(), + resync_recv_counter: meter + .u64_counter("block.resync_recv_counter") + .with_description("Number of blocks received from other nodes in resync operations") + .init() + .bind(&[]), + + bytes_read: meter + .u64_counter("block.bytes_read") + .with_description("Number of bytes read from disk") + .init() + .bind(&[]), + block_read_duration: meter + .f64_value_recorder("block.read_duration") + .with_description("Duration of block read operations") + .init() + .bind(&[]), + bytes_written: meter + .u64_counter("block.bytes_written") + .with_description("Number of bytes written to disk") + .init() + .bind(&[]), + block_write_duration: meter + .f64_value_recorder("block.write_duration") + .with_description("Duration of block write operations") + .init() + .bind(&[]), + delete_counter: meter + .u64_counter("block.delete_counter") + .with_description("Number of blocks deleted") + .init() + .bind(&[]), + + corruption_counter: meter + .u64_counter("block.corruption_counter") + .with_description("Data corruptions detected on block reads") + .init() + .bind(&[]), + } + } +} diff --git a/src/model/lib.rs b/src/model/lib.rs index 9deaae9d..c8677603 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -11,6 +11,7 @@ pub mod object_table; pub mod version_table; pub mod block; +mod block_metrics; pub mod garage; pub mod helper; diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index f06606e5..57b61a08 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -43,8 +43,9 @@ futures = "0.3" futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } tokio-stream = { version = "0.1", features = ["net"] } +opentelemetry = "0.17" #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" } -netapp = "0.3.0" +netapp = "0.3.1" hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 2c877a7f..736513f4 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -10,6 +10,7 @@ pub mod layout; pub mod ring; pub mod system; +mod metrics; pub mod rpc_helper; pub use rpc_helper::*; diff --git a/src/rpc/metrics.rs b/src/rpc/metrics.rs new file mode 100644 index 00000000..c900518c --- /dev/null +++ b/src/rpc/metrics.rs @@ -0,0 +1,55 @@ +use std::sync::Arc; + +use opentelemetry::{global, metrics::*}; +use tokio::sync::Semaphore; + +/// TableMetrics reference all counter used for metrics +pub struct RpcMetrics { + pub(crate) _rpc_available_permits: ValueObserver<u64>, + + pub(crate) rpc_counter: Counter<u64>, + pub(crate) rpc_timeout_counter: Counter<u64>, + pub(crate) rpc_netapp_error_counter: Counter<u64>, + pub(crate) rpc_garage_error_counter: Counter<u64>, + + pub(crate) rpc_duration: ValueRecorder<f64>, + pub(crate) rpc_queueing_time: ValueRecorder<f64>, +} +impl RpcMetrics { + pub fn new(sem: Arc<Semaphore>) -> Self { + let meter = global::meter("garage_rpc"); + RpcMetrics { + _rpc_available_permits: meter + .u64_value_observer("rpc.available_permits", move |observer| { + observer.observe(sem.available_permits() as u64, &[]) + }) + .with_description("Number of available RPC permits") + .init(), + + rpc_counter: meter + .u64_counter("rpc.request_counter") + .with_description("Number of RPC requests emitted") + .init(), + rpc_timeout_counter: meter + .u64_counter("rpc.timeout_counter") + .with_description("Number of RPC timeouts") + .init(), + rpc_netapp_error_counter: meter + .u64_counter("rpc.netapp_error_counter") + .with_description("Number of communication errors (errors in the Netapp library)") + .init(), + rpc_garage_error_counter: meter + .u64_counter("rpc.garage_error_counter") + .with_description("Number of RPC errors (errors happening when handling the RPC)") + .init(), + rpc_duration: meter + .f64_value_recorder("rpc.duration") + .with_description("Duration of RPCs") + .init(), + rpc_queueing_time: meter + .f64_value_recorder("rpc.queueing_time") + .with_description("Time RPC requests were queued for before being sent") + .init(), + } + } +} diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 68bdfc4f..0d722e43 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -1,11 +1,12 @@ //! Contain structs related to making RPCs use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use futures::future::join_all; use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::StreamExt; use futures_util::future::FutureExt; +use opentelemetry::KeyValue; use tokio::select; use tokio::sync::{watch, Semaphore}; @@ -18,6 +19,7 @@ use garage_util::background::BackgroundRunner; use garage_util::data::*; use garage_util::error::Error; +use crate::metrics::RpcMetrics; use crate::ring::Ring; const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); @@ -76,7 +78,8 @@ struct RpcHelperInner { fullmesh: Arc<FullMeshPeeringStrategy>, background: Arc<BackgroundRunner>, ring: watch::Receiver<Arc<Ring>>, - request_buffer_semaphore: Semaphore, + request_buffer_semaphore: Arc<Semaphore>, + metrics: RpcMetrics, } impl RpcHelper { @@ -86,12 +89,17 @@ impl RpcHelper { background: Arc<BackgroundRunner>, ring: watch::Receiver<Arc<Ring>>, ) -> Self { + let sem = Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE)); + + let metrics = RpcMetrics::new(sem.clone()); + Self(Arc::new(RpcHelperInner { our_node_id, fullmesh, background, ring, - request_buffer_semaphore: Semaphore::new(REQUEST_BUFFER_SIZE), + request_buffer_semaphore: sem, + metrics, })) } @@ -120,6 +128,9 @@ impl RpcHelper { M: Rpc<Response = Result<S, Error>>, H: EndpointHandler<M>, { + let queueing_start_time = SystemTime::now(); + let metric_tags = [KeyValue::new("endpoint", endpoint.path().to_string())]; + let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32; let permit = self .0 @@ -127,14 +138,36 @@ impl RpcHelper { .acquire_many(msg_size) .await?; + self.0.metrics.rpc_queueing_time.record( + queueing_start_time + .elapsed() + .map_or(0.0, |d| d.as_secs_f64()), + &metric_tags, + ); + self.0.metrics.rpc_counter.add(1, &metric_tags); + let rpc_start_time = SystemTime::now(); + let node_id = to.into(); select! { res = endpoint.call(&node_id, &msg, strat.rs_priority) => { drop(permit); - Ok(res??) + + if res.is_err() { + self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags); + } + let res = res?; + + self.0.metrics.rpc_duration + .record(rpc_start_time.elapsed().map_or(0.0, |d| d.as_secs_f64()), &metric_tags); + if res.is_err() { + self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags); + } + + Ok(res?) } _ = tokio::time::sleep(strat.rs_timeout) => { drop(permit); + self.0.metrics.rpc_timeout_counter.add(1, &metric_tags); Err(Error::Timeout) } } diff --git a/src/table/data.rs b/src/table/data.rs index a5209c26..684afdcd 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -54,7 +54,7 @@ where .open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME)) .expect("Unable to open DB tree"); - let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone()); + let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone(), gc_todo.clone()); Arc::new(Self { system, @@ -171,6 +171,8 @@ where })?; if let Some((old_entry, new_entry, new_bytes_hash)) = changed { + self.metrics.internal_update_counter.add(1); + let is_tombstone = new_entry.is_tombstone(); self.instance.updated(old_entry, Some(new_entry)); self.merkle_todo_notify.notify_one(); @@ -205,6 +207,8 @@ where })?; if removed { + self.metrics.internal_delete_counter.add(1); + let old_entry = self.decode_entry(v)?; self.instance.updated(Some(old_entry), None); self.merkle_todo_notify.notify_one(); diff --git a/src/table/metrics.rs b/src/table/metrics.rs index 38e93904..548bf0d6 100644 --- a/src/table/metrics.rs +++ b/src/table/metrics.rs @@ -2,15 +2,27 @@ use opentelemetry::{global, metrics::*, KeyValue}; /// TableMetrics reference all counter used for metrics pub struct TableMetrics { - merkle_updater_todo_queue_length: ValueObserver<u64>, + pub(crate) _merkle_todo_len: ValueObserver<u64>, + pub(crate) _gc_todo_len: ValueObserver<u64>, + + pub(crate) get_request_counter: BoundCounter<u64>, + pub(crate) get_request_duration: BoundValueRecorder<f64>, + pub(crate) put_request_counter: BoundCounter<u64>, + pub(crate) put_request_duration: BoundValueRecorder<f64>, + + pub(crate) internal_update_counter: BoundCounter<u64>, + pub(crate) internal_delete_counter: BoundCounter<u64>, + + pub(crate) sync_items_sent: Counter<u64>, + pub(crate) sync_items_received: Counter<u64>, } impl TableMetrics { - pub fn new(table_name: &'static str, merkle_todo: sled::Tree) -> Self { + pub fn new(table_name: &'static str, merkle_todo: sled::Tree, gc_todo: sled::Tree) -> Self { let meter = global::meter(table_name); TableMetrics { - merkle_updater_todo_queue_length: meter + _merkle_todo_len: meter .u64_value_observer( - format!("merkle_updater_todo_queue_length"), + "table.merkle_updater_todo_queue_length", move |observer| { observer.observe( merkle_todo.len() as u64, @@ -18,7 +30,60 @@ impl TableMetrics { ) }, ) - .with_description("Bucket merkle updater TODO queue length") + .with_description("Merkle tree updater TODO queue length") + .init(), + _gc_todo_len: meter + .u64_value_observer( + "table.gc_todo_queue_length", + move |observer| { + observer.observe( + gc_todo.len() as u64, + &[KeyValue::new("table_name", table_name)], + ) + }, + ) + .with_description("Table garbage collector TODO queue length") + .init(), + + get_request_counter: meter + .u64_counter("table.get_request_counter") + .with_description("Number of get/get_range requests internally made on this table") + .init() + .bind(&[KeyValue::new("table_name", table_name)]), + get_request_duration: meter + .f64_value_recorder("table.get_request_duration") + .with_description("Duration of get/get_range requests internally made on this table, in seconds") + .init() + .bind(&[KeyValue::new("table_name", table_name)]), + put_request_counter: meter + .u64_counter("table.put_request_counter") + .with_description("Number of insert/insert_many requests internally made on this table") + .init() + .bind(&[KeyValue::new("table_name", table_name)]), + put_request_duration: meter + .f64_value_recorder("table.put_request_duration") + .with_description("Duration of insert/insert_many requests internally made on this table, in seconds") + .init() + .bind(&[KeyValue::new("table_name", table_name)]), + + internal_update_counter: meter + .u64_counter("table.internal_update_counter") + .with_description("Number of value updates where the value actually changes (includes creation of new key and update of existing key)") + .init() + .bind(&[KeyValue::new("table_name", table_name)]), + internal_delete_counter: meter + .u64_counter("table.internal_delete_counter") + .with_description("Number of value deletions in the tree (due to GC or repartitioning)") + .init() + .bind(&[KeyValue::new("table_name", table_name)]), + + sync_items_sent: meter + .u64_counter("table.sync_items_sent") + .with_description("Number of data items sent to other nodes during resync procedures") + .init(), + sync_items_received: meter + .u64_counter("table.sync_items_received") + .with_description("Number of data items received from other nodes during resync procedures") .init(), } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 1df2b01d..08069ad0 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -6,6 +6,7 @@ use async_trait::async_trait; use futures::select; use futures_util::future::*; use futures_util::stream::*; +use opentelemetry::KeyValue; use rand::Rng; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; @@ -312,6 +313,16 @@ where ) -> Result<(), Error> { let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>(); + for to in nodes.iter() { + self.data.metrics.sync_items_sent.add( + values.len() as u64, + &[ + KeyValue::new("table_name", F::TABLE_NAME), + KeyValue::new("to", format!("{:?}", to)), + ], + ); + } + self.system .rpc .try_call_many( @@ -500,6 +511,14 @@ where .map(|x| Arc::new(ByteBuf::from(x))) .collect::<Vec<_>>(); + self.data.metrics.sync_items_sent.add( + values.len() as u64, + &[ + KeyValue::new("table_name", F::TABLE_NAME), + KeyValue::new("to", format!("{:?}", who)), + ], + ); + let rpc_resp = self .system .rpc @@ -527,7 +546,7 @@ where F: TableSchema + 'static, R: TableReplication + 'static, { - async fn handle(self: &Arc<Self>, message: &SyncRpc, _from: NodeID) -> Result<SyncRpc, Error> { + async fn handle(self: &Arc<Self>, message: &SyncRpc, from: NodeID) -> Result<SyncRpc, Error> { match message { SyncRpc::RootCkHash(range, h) => { let (_root_ck_key, root_ck) = self.get_root_ck(*range)?; @@ -539,6 +558,17 @@ where Ok(SyncRpc::Node(k.clone(), node)) } SyncRpc::Items(items) => { + self.data.metrics.sync_items_received.add( + items.len() as u64, + &[ + KeyValue::new("table_name", F::TABLE_NAME), + KeyValue::new( + "from", + format!("{:?}", Uuid::try_from(from.as_ref()).unwrap()), + ), + ], + ); + self.data.update_many(items)?; Ok(SyncRpc::Ok) } diff --git a/src/table/table.rs b/src/table/table.rs index 01789c11..3ac3bc5b 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use async_trait::async_trait; use futures::stream::*; @@ -81,6 +81,8 @@ where } pub async fn insert(&self, e: &F::E) -> Result<(), Error> { + let request_start = SystemTime::now(); + let hash = e.partition_key().hash(); let who = self.data.replication.write_nodes(&hash); //eprintln!("insert who: {:?}", who); @@ -99,10 +101,18 @@ where .with_timeout(TABLE_RPC_TIMEOUT), ) .await?; + + self.data.metrics.put_request_counter.add(1); + self.data + .metrics + .put_request_duration + .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); Ok(()) } pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> { + let request_start = SystemTime::now(); + let mut call_list: HashMap<_, Vec<_>> = HashMap::new(); for entry in entries.iter() { @@ -140,6 +150,12 @@ where if errors.len() > self.data.replication.max_write_errors() { Err(Error::Message("Too many errors".into())) } else { + self.data.metrics.put_request_counter.add(1); + self.data + .metrics + .put_request_duration + .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); + Ok(()) } } @@ -149,6 +165,8 @@ where partition_key: &F::P, sort_key: &F::S, ) -> Result<Option<F::E>, Error> { + let request_start = SystemTime::now(); + let hash = partition_key.hash(); let who = self.data.replication.read_nodes(&hash); //eprintln!("get who: {:?}", who); @@ -198,6 +216,12 @@ where .spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await }); } } + + self.data.metrics.get_request_counter.add(1); + self.data + .metrics + .get_request_duration + .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); Ok(ret) } @@ -208,6 +232,8 @@ where filter: Option<F::Filter>, limit: usize, ) -> Result<Vec<F::E>, Error> { + let request_start = SystemTime::now(); + let hash = partition_key.hash(); let who = self.data.replication.read_nodes(&hash); @@ -265,6 +291,12 @@ where .take(limit) .map(|(_k, v)| v.take().unwrap()) .collect::<Vec<_>>(); + + self.data.metrics.get_request_counter.add(1); + self.data + .metrics + .get_request_duration + .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); Ok(ret_vec) } |