aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2022-02-16 14:23:04 +0100
committerAlex Auvolat <alex@adnab.me>2022-03-14 10:51:50 +0100
commit2cab84b1fe423a41b356211e592a614c95ec4e0c (patch)
treec7dc3227feccbd6f4a8aba0bf8025201f3acc229 /src
parent1e2cf26373ef1812a3152a0057774f6381e66914 (diff)
downloadgarage-2cab84b1fe423a41b356211e592a614c95ec4e0c.tar.gz
garage-2cab84b1fe423a41b356211e592a614c95ec4e0c.zip
Add many metrics in table/ and rpc/
Diffstat (limited to 'src')
-rw-r--r--src/admin/Cargo.toml1
-rw-r--r--src/admin/lib.rs1
-rw-r--r--src/admin/metrics.rs21
-rw-r--r--src/garage/tests/common/garage.rs4
-rw-r--r--src/model/Cargo.toml1
-rw-r--r--src/model/block.rs55
-rw-r--r--src/model/block_metrics.rs93
-rw-r--r--src/model/lib.rs1
-rw-r--r--src/rpc/Cargo.toml3
-rw-r--r--src/rpc/lib.rs1
-rw-r--r--src/rpc/metrics.rs55
-rw-r--r--src/rpc/rpc_helper.rs41
-rw-r--r--src/table/data.rs6
-rw-r--r--src/table/metrics.rs75
-rw-r--r--src/table/sync.rs32
-rw-r--r--src/table/table.rs34
16 files changed, 391 insertions, 33 deletions
diff --git a/src/admin/Cargo.toml b/src/admin/Cargo.toml
index 9775b667..6f646869 100644
--- a/src/admin/Cargo.toml
+++ b/src/admin/Cargo.toml
@@ -25,4 +25,3 @@ log = "0.4"
opentelemetry = "0.17"
opentelemetry-prometheus = "0.10"
prometheus = "0.13"
-lazy_static = "1.4"
diff --git a/src/admin/lib.rs b/src/admin/lib.rs
index 443361be..f1e8ddd7 100644
--- a/src/admin/lib.rs
+++ b/src/admin/lib.rs
@@ -1,6 +1,5 @@
//! Crate for handling the admin and metric HTTP APIs
#[macro_use]
extern crate log;
-extern crate lazy_static;
pub mod metrics;
diff --git a/src/admin/metrics.rs b/src/admin/metrics.rs
index ccc26d26..44fd4cb2 100644
--- a/src/admin/metrics.rs
+++ b/src/admin/metrics.rs
@@ -3,11 +3,9 @@ use hyper::{
service::{make_service_fn, service_fn},
Body, Method, Request, Response, Server,
};
-use lazy_static::lazy_static;
use opentelemetry::{
global,
metrics::{BoundCounter, BoundValueRecorder},
- KeyValue,
};
use opentelemetry_prometheus::PrometheusExporter;
use prometheus::{Encoder, TextEncoder};
@@ -19,11 +17,6 @@ use futures::future::*;
use garage_model::garage::Garage;
use garage_util::error::Error as GarageError;
-lazy_static! {
- // This defines the differennt tags that will be referenced by the object
- static ref HANDLER_ALL: [KeyValue; 1] = [KeyValue::new("handler", "all")];
-}
-
// serve_req on metric endpoint
async fn serve_req(
req: Request<Body>,
@@ -87,20 +80,20 @@ impl AdminServer {
exporter,
metrics: AdminServerMetrics {
http_counter: meter
- .u64_counter("router.http_requests_total")
+ .u64_counter("admin.http_requests_total")
.with_description("Total number of HTTP requests made.")
.init()
- .bind(HANDLER_ALL.as_ref()),
+ .bind(&[]),
http_body_gauge: meter
- .u64_value_recorder("example.http_response_size_bytes")
+ .u64_value_recorder("admin.http_response_size_bytes")
.with_description("The metrics HTTP response sizes in bytes.")
.init()
- .bind(HANDLER_ALL.as_ref()),
+ .bind(&[]),
http_req_histogram: meter
- .f64_value_recorder("example.http_request_duration_seconds")
+ .f64_value_recorder("admin.http_request_duration_seconds")
.with_description("The HTTP request latencies in seconds.")
.init()
- .bind(HANDLER_ALL.as_ref()),
+ .bind(&[]),
},
}
}
@@ -125,7 +118,7 @@ impl AdminServer {
let addr = &garage.config.admin_api.bind_addr;
- let server = Server::bind(&addr).serve(make_svc);
+ let server = Server::bind(addr).serve(make_svc);
let graceful = server.with_graceful_shutdown(shutdown_signal);
info!("Admin server listening on http://{}", addr);
diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs
index 92aa2edf..12cf946b 100644
--- a/src/garage/tests/common/garage.rs
+++ b/src/garage/tests/common/garage.rs
@@ -65,6 +65,9 @@ root_domain = ".s3.garage"
bind_addr = "127.0.0.1:{web_port}"
root_domain = ".web.garage"
index = "index.html"
+
+[admin_api]
+bind_addr = "127.0.0.1:{admin_port}"
"#,
path = path.display(),
secret = GARAGE_TEST_SECRET,
@@ -72,6 +75,7 @@ index = "index.html"
api_port = port,
rpc_port = port + 1,
web_port = port + 2,
+ admin_port = port + 3,
);
fs::write(path.join("config.toml"), config).expect("Could not write garage config file");
diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml
index 14e49557..10a4c838 100644
--- a/src/model/Cargo.toml
+++ b/src/model/Cargo.toml
@@ -36,6 +36,7 @@ serde_bytes = "0.11"
futures = "0.3"
futures-util = "0.3"
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
+opentelemetry = "0.17"
#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
netapp = "0.3.0"
diff --git a/src/model/block.rs b/src/model/block.rs
index 1173c7b3..9e939c24 100644
--- a/src/model/block.rs
+++ b/src/model/block.rs
@@ -1,12 +1,13 @@
use std::convert::TryInto;
use std::path::{Path, PathBuf};
use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, SystemTime};
use arc_swap::ArcSwapOption;
use async_trait::async_trait;
use futures::future::*;
use futures::select;
+use opentelemetry::KeyValue;
use serde::{Deserialize, Serialize};
use tokio::fs;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
@@ -23,8 +24,8 @@ use garage_rpc::*;
use garage_table::replication::{TableReplication, TableShardedReplication};
+use crate::block_metrics::*;
use crate::block_ref_table::*;
-
use crate::garage::Garage;
/// Size under which data will be stored inlined in database instead of as files
@@ -154,6 +155,8 @@ pub struct BlockManager {
system: Arc<System>,
endpoint: Arc<Endpoint<BlockRpc, Self>>,
pub(crate) garage: ArcSwapOption<Garage>,
+
+ metrics: BlockManagerMetrics,
}
// This custom struct contains functions that must only be ran
@@ -182,6 +185,8 @@ impl BlockManager {
let manager_locked = BlockManagerLocked();
+ let metrics = BlockManagerMetrics::new(resync_queue.clone());
+
let block_manager = Arc::new(Self {
replication,
data_dir,
@@ -192,6 +197,7 @@ impl BlockManager {
system,
endpoint,
garage: ArcSwapOption::from(None),
+ metrics,
});
block_manager.endpoint.set_handler(block_manager.clone());
@@ -380,15 +386,28 @@ impl BlockManager {
/// Write a block to disk
async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<BlockRpc, Error> {
- self.mutation_lock
+ let request_start = SystemTime::now();
+ let write_size = data.inner_buffer().len() as u64;
+
+ let res = self
+ .mutation_lock
.lock()
.await
.write_block(hash, data, self)
- .await
+ .await?;
+
+ self.metrics.bytes_written.add(write_size);
+ self.metrics
+ .block_write_duration
+ .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+
+ Ok(res)
}
/// Read block from disk, verifying it's integrity
async fn read_block(&self, hash: &Hash) -> Result<BlockRpc, Error> {
+ let request_start = SystemTime::now();
+
let mut path = self.block_path(hash);
let compressed = match self.is_block_compressed(hash).await {
Ok(c) => c,
@@ -414,6 +433,8 @@ impl BlockManager {
};
if data.verify(*hash).is_err() {
+ self.metrics.corruption_counter.add(1);
+
self.mutation_lock
.lock()
.await
@@ -423,6 +444,13 @@ impl BlockManager {
return Err(Error::CorruptData(*hash));
}
+ self.metrics
+ .bytes_read
+ .add(data.inner_buffer().len() as u64);
+ self.metrics
+ .block_read_duration
+ .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+
Ok(BlockRpc::PutBlock { hash: *hash, data })
}
@@ -521,9 +549,18 @@ impl BlockManager {
let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap());
let now = now_msec();
if now >= time_msec {
+ let start_time = SystemTime::now();
+
let hash = Hash::try_from(&hash_bytes[..]).unwrap();
let res = self.resync_block(&hash).await;
+
+ self.metrics.resync_counter.add(1);
+ self.metrics
+ .resync_duration
+ .record(start_time.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+
if let Err(e) = &res {
+ self.metrics.resync_error_counter.add(1);
warn!("Error when resyncing {:?}: {}", hash, e);
self.put_to_resync(&hash, RESYNC_RETRY_DELAY)?;
}
@@ -607,6 +644,12 @@ impl BlockManager {
need_nodes.len()
);
+ for node in need_nodes.iter() {
+ self.metrics
+ .resync_send_counter
+ .add(1, &[KeyValue::new("to", format!("{:?}", node))]);
+ }
+
let put_block_message = self.read_block(hash).await?;
self.system
.rpc
@@ -644,6 +687,9 @@ impl BlockManager {
);
let block_data = self.rpc_get_raw_block(hash).await?;
+
+ self.metrics.resync_recv_counter.add(1);
+
self.write_block(hash, &block_data).await?;
}
@@ -819,6 +865,7 @@ impl BlockManagerLocked {
path.set_extension("zst");
}
fs::remove_file(path).await?;
+ mgr.metrics.delete_counter.add(1);
}
Ok(())
}
diff --git a/src/model/block_metrics.rs b/src/model/block_metrics.rs
new file mode 100644
index 00000000..7ef9a117
--- /dev/null
+++ b/src/model/block_metrics.rs
@@ -0,0 +1,93 @@
+use opentelemetry::{global, metrics::*};
+
+/// BlockManagerMetrics references all counters, recorders and observers used for block manager metrics
+pub struct BlockManagerMetrics {
+ pub(crate) _resync_queue_len: ValueObserver<u64>,
+
+ pub(crate) resync_counter: BoundCounter<u64>,
+ pub(crate) resync_error_counter: BoundCounter<u64>,
+ pub(crate) resync_duration: BoundValueRecorder<f64>,
+ pub(crate) resync_send_counter: Counter<u64>,
+ pub(crate) resync_recv_counter: BoundCounter<u64>,
+
+ pub(crate) bytes_read: BoundCounter<u64>,
+ pub(crate) block_read_duration: BoundValueRecorder<f64>,
+ pub(crate) bytes_written: BoundCounter<u64>,
+ pub(crate) block_write_duration: BoundValueRecorder<f64>,
+ pub(crate) delete_counter: BoundCounter<u64>,
+
+ pub(crate) corruption_counter: BoundCounter<u64>,
+}
+
+impl BlockManagerMetrics {
+ pub fn new(resync_queue: sled::Tree) -> Self {
+ let meter = global::meter("garage_model/block");
+ Self {
+ _resync_queue_len: meter
+ .u64_value_observer("block.resync_queue_length", move |observer| {
+ observer.observe(resync_queue.len() as u64, &[])
+ })
+ .with_description(
+ "Number of block hashes queued for local check and possible resync",
+ )
+ .init(),
+
+ resync_counter: meter
+ .u64_counter("block.resync_counter")
+ .with_description("Number of calls to resync_block")
+ .init()
+ .bind(&[]),
+ resync_error_counter: meter
+ .u64_counter("block.resync_error_counter")
+ .with_description("Number of calls to resync_block that returned an error")
+ .init()
+ .bind(&[]),
+ resync_duration: meter
+ .f64_value_recorder("block.resync_duration")
+ .with_description("Duration of resync_block operations")
+ .init()
+ .bind(&[]),
+ resync_send_counter: meter
+ .u64_counter("block.resync_send_counter")
+ .with_description("Number of blocks sent to another node in resync operations")
+ .init(),
+ resync_recv_counter: meter
+ .u64_counter("block.resync_recv_counter")
+ .with_description("Number of blocks received from other nodes in resync operations")
+ .init()
+ .bind(&[]),
+
+ bytes_read: meter
+ .u64_counter("block.bytes_read")
+ .with_description("Number of bytes read from disk")
+ .init()
+ .bind(&[]),
+ block_read_duration: meter
+ .f64_value_recorder("block.read_duration")
+ .with_description("Duration of block read operations")
+ .init()
+ .bind(&[]),
+ bytes_written: meter
+ .u64_counter("block.bytes_written")
+ .with_description("Number of bytes written to disk")
+ .init()
+ .bind(&[]),
+ block_write_duration: meter
+ .f64_value_recorder("block.write_duration")
+ .with_description("Duration of block write operations")
+ .init()
+ .bind(&[]),
+ delete_counter: meter
+ .u64_counter("block.delete_counter")
+ .with_description("Number of blocks deleted")
+ .init()
+ .bind(&[]),
+
+ corruption_counter: meter
+ .u64_counter("block.corruption_counter")
+ .with_description("Data corruptions detected on block reads")
+ .init()
+ .bind(&[]),
+ }
+ }
+}
diff --git a/src/model/lib.rs b/src/model/lib.rs
index 9deaae9d..c8677603 100644
--- a/src/model/lib.rs
+++ b/src/model/lib.rs
@@ -11,6 +11,7 @@ pub mod object_table;
pub mod version_table;
pub mod block;
+mod block_metrics;
pub mod garage;
pub mod helper;
diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml
index f06606e5..57b61a08 100644
--- a/src/rpc/Cargo.toml
+++ b/src/rpc/Cargo.toml
@@ -43,8 +43,9 @@ futures = "0.3"
futures-util = "0.3"
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
tokio-stream = { version = "0.1", features = ["net"] }
+opentelemetry = "0.17"
#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-netapp = "0.3.0"
+netapp = "0.3.1"
hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] }
diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs
index 2c877a7f..736513f4 100644
--- a/src/rpc/lib.rs
+++ b/src/rpc/lib.rs
@@ -10,6 +10,7 @@ pub mod layout;
pub mod ring;
pub mod system;
+mod metrics;
pub mod rpc_helper;
pub use rpc_helper::*;
diff --git a/src/rpc/metrics.rs b/src/rpc/metrics.rs
new file mode 100644
index 00000000..c900518c
--- /dev/null
+++ b/src/rpc/metrics.rs
@@ -0,0 +1,55 @@
+use std::sync::Arc;
+
+use opentelemetry::{global, metrics::*};
+use tokio::sync::Semaphore;
+
+/// RpcMetrics references all counters, recorders and observers used for RPC metrics
+pub struct RpcMetrics {
+ pub(crate) _rpc_available_permits: ValueObserver<u64>,
+
+ pub(crate) rpc_counter: Counter<u64>,
+ pub(crate) rpc_timeout_counter: Counter<u64>,
+ pub(crate) rpc_netapp_error_counter: Counter<u64>,
+ pub(crate) rpc_garage_error_counter: Counter<u64>,
+
+ pub(crate) rpc_duration: ValueRecorder<f64>,
+ pub(crate) rpc_queueing_time: ValueRecorder<f64>,
+}
+impl RpcMetrics {
+ pub fn new(sem: Arc<Semaphore>) -> Self {
+ let meter = global::meter("garage_rpc");
+ RpcMetrics {
+ _rpc_available_permits: meter
+ .u64_value_observer("rpc.available_permits", move |observer| {
+ observer.observe(sem.available_permits() as u64, &[])
+ })
+ .with_description("Number of available RPC permits")
+ .init(),
+
+ rpc_counter: meter
+ .u64_counter("rpc.request_counter")
+ .with_description("Number of RPC requests emitted")
+ .init(),
+ rpc_timeout_counter: meter
+ .u64_counter("rpc.timeout_counter")
+ .with_description("Number of RPC timeouts")
+ .init(),
+ rpc_netapp_error_counter: meter
+ .u64_counter("rpc.netapp_error_counter")
+ .with_description("Number of communication errors (errors in the Netapp library)")
+ .init(),
+ rpc_garage_error_counter: meter
+ .u64_counter("rpc.garage_error_counter")
+ .with_description("Number of RPC errors (errors happening when handling the RPC)")
+ .init(),
+ rpc_duration: meter
+ .f64_value_recorder("rpc.duration")
+ .with_description("Duration of RPCs")
+ .init(),
+ rpc_queueing_time: meter
+ .f64_value_recorder("rpc.queueing_time")
+ .with_description("Time RPC requests were queued for before being sent")
+ .init(),
+ }
+ }
+}
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs
index 68bdfc4f..0d722e43 100644
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -1,11 +1,12 @@
//! Contain structs related to making RPCs
use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, SystemTime};
use futures::future::join_all;
use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt;
use futures_util::future::FutureExt;
+use opentelemetry::KeyValue;
use tokio::select;
use tokio::sync::{watch, Semaphore};
@@ -18,6 +19,7 @@ use garage_util::background::BackgroundRunner;
use garage_util::data::*;
use garage_util::error::Error;
+use crate::metrics::RpcMetrics;
use crate::ring::Ring;
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
@@ -76,7 +78,8 @@ struct RpcHelperInner {
fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>,
- request_buffer_semaphore: Semaphore,
+ request_buffer_semaphore: Arc<Semaphore>,
+ metrics: RpcMetrics,
}
impl RpcHelper {
@@ -86,12 +89,17 @@ impl RpcHelper {
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>,
) -> Self {
+ let sem = Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE));
+
+ let metrics = RpcMetrics::new(sem.clone());
+
Self(Arc::new(RpcHelperInner {
our_node_id,
fullmesh,
background,
ring,
- request_buffer_semaphore: Semaphore::new(REQUEST_BUFFER_SIZE),
+ request_buffer_semaphore: sem,
+ metrics,
}))
}
@@ -120,6 +128,9 @@ impl RpcHelper {
M: Rpc<Response = Result<S, Error>>,
H: EndpointHandler<M>,
{
+ let queueing_start_time = SystemTime::now();
+ let metric_tags = [KeyValue::new("endpoint", endpoint.path().to_string())];
+
let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32;
let permit = self
.0
@@ -127,14 +138,36 @@ impl RpcHelper {
.acquire_many(msg_size)
.await?;
+ self.0.metrics.rpc_queueing_time.record(
+ queueing_start_time
+ .elapsed()
+ .map_or(0.0, |d| d.as_secs_f64()),
+ &metric_tags,
+ );
+ self.0.metrics.rpc_counter.add(1, &metric_tags);
+ let rpc_start_time = SystemTime::now();
+
let node_id = to.into();
select! {
res = endpoint.call(&node_id, &msg, strat.rs_priority) => {
drop(permit);
- Ok(res??)
+
+ if res.is_err() {
+ self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags);
+ }
+ let res = res?;
+
+ self.0.metrics.rpc_duration
+ .record(rpc_start_time.elapsed().map_or(0.0, |d| d.as_secs_f64()), &metric_tags);
+ if res.is_err() {
+ self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags);
+ }
+
+ Ok(res?)
}
_ = tokio::time::sleep(strat.rs_timeout) => {
drop(permit);
+ self.0.metrics.rpc_timeout_counter.add(1, &metric_tags);
Err(Error::Timeout)
}
}
diff --git a/src/table/data.rs b/src/table/data.rs
index a5209c26..684afdcd 100644
--- a/src/table/data.rs
+++ b/src/table/data.rs
@@ -54,7 +54,7 @@ where
.open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME))
.expect("Unable to open DB tree");
- let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone());
+ let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone(), gc_todo.clone());
Arc::new(Self {
system,
@@ -171,6 +171,8 @@ where
})?;
if let Some((old_entry, new_entry, new_bytes_hash)) = changed {
+ self.metrics.internal_update_counter.add(1);
+
let is_tombstone = new_entry.is_tombstone();
self.instance.updated(old_entry, Some(new_entry));
self.merkle_todo_notify.notify_one();
@@ -205,6 +207,8 @@ where
})?;
if removed {
+ self.metrics.internal_delete_counter.add(1);
+
let old_entry = self.decode_entry(v)?;
self.instance.updated(Some(old_entry), None);
self.merkle_todo_notify.notify_one();
diff --git a/src/table/metrics.rs b/src/table/metrics.rs
index 38e93904..548bf0d6 100644
--- a/src/table/metrics.rs
+++ b/src/table/metrics.rs
@@ -2,15 +2,27 @@ use opentelemetry::{global, metrics::*, KeyValue};
/// TableMetrics reference all counter used for metrics
pub struct TableMetrics {
- merkle_updater_todo_queue_length: ValueObserver<u64>,
+ pub(crate) _merkle_todo_len: ValueObserver<u64>,
+ pub(crate) _gc_todo_len: ValueObserver<u64>,
+
+ pub(crate) get_request_counter: BoundCounter<u64>,
+ pub(crate) get_request_duration: BoundValueRecorder<f64>,
+ pub(crate) put_request_counter: BoundCounter<u64>,
+ pub(crate) put_request_duration: BoundValueRecorder<f64>,
+
+ pub(crate) internal_update_counter: BoundCounter<u64>,
+ pub(crate) internal_delete_counter: BoundCounter<u64>,
+
+ pub(crate) sync_items_sent: Counter<u64>,
+ pub(crate) sync_items_received: Counter<u64>,
}
impl TableMetrics {
- pub fn new(table_name: &'static str, merkle_todo: sled::Tree) -> Self {
+ pub fn new(table_name: &'static str, merkle_todo: sled::Tree, gc_todo: sled::Tree) -> Self {
let meter = global::meter(table_name);
TableMetrics {
- merkle_updater_todo_queue_length: meter
+ _merkle_todo_len: meter
.u64_value_observer(
- format!("merkle_updater_todo_queue_length"),
+ "table.merkle_updater_todo_queue_length",
move |observer| {
observer.observe(
merkle_todo.len() as u64,
@@ -18,7 +30,60 @@ impl TableMetrics {
)
},
)
- .with_description("Bucket merkle updater TODO queue length")
+ .with_description("Merkle tree updater TODO queue length")
+ .init(),
+ _gc_todo_len: meter
+ .u64_value_observer(
+ "table.gc_todo_queue_length",
+ move |observer| {
+ observer.observe(
+ gc_todo.len() as u64,
+ &[KeyValue::new("table_name", table_name)],
+ )
+ },
+ )
+ .with_description("Table garbage collector TODO queue length")
+ .init(),
+
+ get_request_counter: meter
+ .u64_counter("table.get_request_counter")
+ .with_description("Number of get/get_range requests internally made on this table")
+ .init()
+ .bind(&[KeyValue::new("table_name", table_name)]),
+ get_request_duration: meter
+ .f64_value_recorder("table.get_request_duration")
+ .with_description("Duration of get/get_range requests internally made on this table, in seconds")
+ .init()
+ .bind(&[KeyValue::new("table_name", table_name)]),
+ put_request_counter: meter
+ .u64_counter("table.put_request_counter")
+ .with_description("Number of insert/insert_many requests internally made on this table")
+ .init()
+ .bind(&[KeyValue::new("table_name", table_name)]),
+ put_request_duration: meter
+ .f64_value_recorder("table.put_request_duration")
+ .with_description("Duration of insert/insert_many requests internally made on this table, in seconds")
+ .init()
+ .bind(&[KeyValue::new("table_name", table_name)]),
+
+ internal_update_counter: meter
+ .u64_counter("table.internal_update_counter")
+ .with_description("Number of value updates where the value actually changes (includes creation of new key and update of existing key)")
+ .init()
+ .bind(&[KeyValue::new("table_name", table_name)]),
+ internal_delete_counter: meter
+ .u64_counter("table.internal_delete_counter")
+ .with_description("Number of value deletions in the tree (due to GC or repartitioning)")
+ .init()
+ .bind(&[KeyValue::new("table_name", table_name)]),
+
+ sync_items_sent: meter
+ .u64_counter("table.sync_items_sent")
+ .with_description("Number of data items sent to other nodes during resync procedures")
+ .init(),
+ sync_items_received: meter
+ .u64_counter("table.sync_items_received")
+ .with_description("Number of data items received from other nodes during resync procedures")
.init(),
}
}
diff --git a/src/table/sync.rs b/src/table/sync.rs
index 1df2b01d..08069ad0 100644
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@@ -6,6 +6,7 @@ use async_trait::async_trait;
use futures::select;
use futures_util::future::*;
use futures_util::stream::*;
+use opentelemetry::KeyValue;
use rand::Rng;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
@@ -312,6 +313,16 @@ where
) -> Result<(), Error> {
let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();
+ for to in nodes.iter() {
+ self.data.metrics.sync_items_sent.add(
+ values.len() as u64,
+ &[
+ KeyValue::new("table_name", F::TABLE_NAME),
+ KeyValue::new("to", format!("{:?}", to)),
+ ],
+ );
+ }
+
self.system
.rpc
.try_call_many(
@@ -500,6 +511,14 @@ where
.map(|x| Arc::new(ByteBuf::from(x)))
.collect::<Vec<_>>();
+ self.data.metrics.sync_items_sent.add(
+ values.len() as u64,
+ &[
+ KeyValue::new("table_name", F::TABLE_NAME),
+ KeyValue::new("to", format!("{:?}", who)),
+ ],
+ );
+
let rpc_resp = self
.system
.rpc
@@ -527,7 +546,7 @@ where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
- async fn handle(self: &Arc<Self>, message: &SyncRpc, _from: NodeID) -> Result<SyncRpc, Error> {
+ async fn handle(self: &Arc<Self>, message: &SyncRpc, from: NodeID) -> Result<SyncRpc, Error> {
match message {
SyncRpc::RootCkHash(range, h) => {
let (_root_ck_key, root_ck) = self.get_root_ck(*range)?;
@@ -539,6 +558,17 @@ where
Ok(SyncRpc::Node(k.clone(), node))
}
SyncRpc::Items(items) => {
+ self.data.metrics.sync_items_received.add(
+ items.len() as u64,
+ &[
+ KeyValue::new("table_name", F::TABLE_NAME),
+ KeyValue::new(
+ "from",
+ format!("{:?}", Uuid::try_from(from.as_ref()).unwrap()),
+ ),
+ ],
+ );
+
self.data.update_many(items)?;
Ok(SyncRpc::Ok)
}
diff --git a/src/table/table.rs b/src/table/table.rs
index 01789c11..3ac3bc5b 100644
--- a/src/table/table.rs
+++ b/src/table/table.rs
@@ -1,6 +1,6 @@
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, SystemTime};
use async_trait::async_trait;
use futures::stream::*;
@@ -81,6 +81,8 @@ where
}
pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
+ let request_start = SystemTime::now();
+
let hash = e.partition_key().hash();
let who = self.data.replication.write_nodes(&hash);
//eprintln!("insert who: {:?}", who);
@@ -99,10 +101,18 @@ where
.with_timeout(TABLE_RPC_TIMEOUT),
)
.await?;
+
+ self.data.metrics.put_request_counter.add(1);
+ self.data
+ .metrics
+ .put_request_duration
+ .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
Ok(())
}
pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> {
+ let request_start = SystemTime::now();
+
let mut call_list: HashMap<_, Vec<_>> = HashMap::new();
for entry in entries.iter() {
@@ -140,6 +150,12 @@ where
if errors.len() > self.data.replication.max_write_errors() {
Err(Error::Message("Too many errors".into()))
} else {
+ self.data.metrics.put_request_counter.add(1);
+ self.data
+ .metrics
+ .put_request_duration
+ .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+
Ok(())
}
}
@@ -149,6 +165,8 @@ where
partition_key: &F::P,
sort_key: &F::S,
) -> Result<Option<F::E>, Error> {
+ let request_start = SystemTime::now();
+
let hash = partition_key.hash();
let who = self.data.replication.read_nodes(&hash);
//eprintln!("get who: {:?}", who);
@@ -198,6 +216,12 @@ where
.spawn_cancellable(async move { self2.repair_on_read(&who[..], ent2).await });
}
}
+
+ self.data.metrics.get_request_counter.add(1);
+ self.data
+ .metrics
+ .get_request_duration
+ .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
Ok(ret)
}
@@ -208,6 +232,8 @@ where
filter: Option<F::Filter>,
limit: usize,
) -> Result<Vec<F::E>, Error> {
+ let request_start = SystemTime::now();
+
let hash = partition_key.hash();
let who = self.data.replication.read_nodes(&hash);
@@ -265,6 +291,12 @@ where
.take(limit)
.map(|(_k, v)| v.take().unwrap())
.collect::<Vec<_>>();
+
+ self.data.metrics.get_request_counter.add(1);
+ self.data
+ .metrics
+ .get_request_duration
+ .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
Ok(ret_vec)
}