aboutsummaryrefslogtreecommitdiff
path: root/src/rpc/rpc_helper.rs
diff options
context:
space:
mode:
authorAlex <alex@adnab.me>2024-04-10 15:23:12 +0000
committerAlex <alex@adnab.me>2024-04-10 15:23:12 +0000
commit1779fd40c0fe676bedda0d40f647d7fe8b0f1e7e (patch)
tree47e42c4e6ae47590fbb5c8f94e90a23bf04c1674 /src/rpc/rpc_helper.rs
parentb47706809cc9d28d1328bafdf9756e96388cca24 (diff)
parentff093ddbb8485409f389abe7b5e569cb38d222d2 (diff)
downloadgarage-1779fd40c0fe676bedda0d40f647d7fe8b0f1e7e.tar.gz
garage-1779fd40c0fe676bedda0d40f647d7fe8b0f1e7e.zip
Merge pull request 'Garage v1.0' (#683) from next-0.10 into main
Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/683
Diffstat (limited to 'src/rpc/rpc_helper.rs')
-rw-r--r--src/rpc/rpc_helper.rs509
1 files changed, 369 insertions, 140 deletions
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs
index a1b7951c..ea3e5e76 100644
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -1,12 +1,12 @@
//! Contain structs related to making RPCs
-use std::sync::Arc;
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
use std::time::Duration;
use futures::future::join_all;
use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt;
use tokio::select;
-use tokio::sync::watch;
use opentelemetry::KeyValue;
use opentelemetry::{
@@ -26,8 +26,8 @@ use garage_util::data::*;
use garage_util::error::Error;
use garage_util::metrics::RecordDuration;
+use crate::layout::{LayoutHelper, LayoutVersion};
use crate::metrics::RpcMetrics;
-use crate::ring::Ring;
// Default RPC timeout = 5 minutes
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
@@ -35,11 +35,11 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
/// Strategy to apply when making RPC
pub struct RequestStrategy<T> {
/// Min number of response to consider the request successful
- pub rs_quorum: Option<usize>,
- /// Should requests be dropped after enough response are received
- pub rs_interrupt_after_quorum: bool,
+ rs_quorum: Option<usize>,
+ /// Send all requests at once
+ rs_send_all_at_once: Option<bool>,
/// Request priority
- pub rs_priority: RequestPriority,
+ rs_priority: RequestPriority,
/// Custom timeout for this request
rs_timeout: Timeout,
/// Data to drop when everything completes
@@ -57,7 +57,7 @@ impl Clone for RequestStrategy<()> {
fn clone(&self) -> Self {
RequestStrategy {
rs_quorum: self.rs_quorum,
- rs_interrupt_after_quorum: self.rs_interrupt_after_quorum,
+ rs_send_all_at_once: self.rs_send_all_at_once,
rs_priority: self.rs_priority,
rs_timeout: self.rs_timeout,
rs_drop_on_complete: (),
@@ -70,7 +70,7 @@ impl RequestStrategy<()> {
pub fn with_priority(prio: RequestPriority) -> Self {
RequestStrategy {
rs_quorum: None,
- rs_interrupt_after_quorum: false,
+ rs_send_all_at_once: None,
rs_priority: prio,
rs_timeout: Timeout::Default,
rs_drop_on_complete: (),
@@ -80,7 +80,7 @@ impl RequestStrategy<()> {
pub fn with_drop_on_completion<T>(self, drop_on_complete: T) -> RequestStrategy<T> {
RequestStrategy {
rs_quorum: self.rs_quorum,
- rs_interrupt_after_quorum: self.rs_interrupt_after_quorum,
+ rs_send_all_at_once: self.rs_send_all_at_once,
rs_priority: self.rs_priority,
rs_timeout: self.rs_timeout,
rs_drop_on_complete: drop_on_complete,
@@ -94,10 +94,9 @@ impl<T> RequestStrategy<T> {
self.rs_quorum = Some(quorum);
self
}
- /// Set if requests can be dropped after quorum has been reached
- /// In general true for read requests, and false for write
- pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self {
- self.rs_interrupt_after_quorum = interrupt;
+ /// Set quorum to be reached for request
+ pub fn send_all_at_once(mut self, value: bool) -> Self {
+ self.rs_send_all_at_once = Some(value);
self
}
/// Deactivate timeout for this request
@@ -115,7 +114,7 @@ impl<T> RequestStrategy<T> {
(
RequestStrategy {
rs_quorum: self.rs_quorum,
- rs_interrupt_after_quorum: self.rs_interrupt_after_quorum,
+ rs_send_all_at_once: self.rs_send_all_at_once,
rs_priority: self.rs_priority,
rs_timeout: self.rs_timeout,
rs_drop_on_complete: (),
@@ -131,7 +130,7 @@ pub struct RpcHelper(Arc<RpcHelperInner>);
struct RpcHelperInner {
our_node_id: Uuid,
peering: Arc<PeeringManager>,
- ring: watch::Receiver<Arc<Ring>>,
+ layout: Arc<RwLock<LayoutHelper>>,
metrics: RpcMetrics,
rpc_timeout: Duration,
}
@@ -140,7 +139,7 @@ impl RpcHelper {
pub(crate) fn new(
our_node_id: Uuid,
peering: Arc<PeeringManager>,
- ring: watch::Receiver<Arc<Ring>>,
+ layout: Arc<RwLock<LayoutHelper>>,
rpc_timeout: Option<Duration>,
) -> Self {
let metrics = RpcMetrics::new();
@@ -148,7 +147,7 @@ impl RpcHelper {
Self(Arc::new(RpcHelperInner {
our_node_id,
peering,
- ring,
+ layout,
metrics,
rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
}))
@@ -170,6 +169,12 @@ impl RpcHelper {
N: IntoReq<M> + Send,
H: StreamingEndpointHandler<M>,
{
+ let tracer = opentelemetry::global::tracer("garage");
+ let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to);
+ let mut span = tracer.start(span_name);
+ span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
+ span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
+
let metric_tags = [
KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
@@ -181,6 +186,7 @@ impl RpcHelper {
let node_id = to.into();
let rpc_call = endpoint
.call_streaming(&node_id, msg, strat.rs_priority)
+ .with_context(Context::current_with_span(span))
.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
let timeout = async {
@@ -223,12 +229,17 @@ impl RpcHelper {
N: IntoReq<M>,
H: StreamingEndpointHandler<M>,
{
+ let tracer = opentelemetry::global::tracer("garage");
+ let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len());
+ let span = tracer.start(span_name);
+
let msg = msg.into_req().map_err(garage_net::error::Error::from)?;
let resps = join_all(
to.iter()
.map(|to| self.call(endpoint, *to, msg.clone(), strat.clone())),
)
+ .with_context(Context::current_with_span(span))
.await;
Ok(to
.iter()
@@ -260,53 +271,61 @@ impl RpcHelper {
/// Make a RPC call to multiple servers, returning either a Vec of responses,
/// or an error if quorum could not be reached due to too many errors
- pub async fn try_call_many<M, N, H, S, T>(
+ ///
+ /// If RequestStrategy has send_all_at_once set, then all requests will be
+ /// sent at once, and `try_call_many` will return as soon as a quorum of
+ /// responses is achieved, dropping and cancelling the remaining requests.
+ ///
+ /// Otherwise, `quorum` requests will be sent at the same time, and if an
+ /// error response is received, a new request will be sent to replace it.
+ /// The ordering of nodes to which requests are sent is determined by
+ /// the `RpcHelper::request_order` function, which takes into account
+ /// parameters such as node zones and measured ping values.
+ ///
+ /// In both cases, the basic contract of this function is that even in the
+ /// absence of failures, the RPC call might not be driven to completion
+ /// on all of the specified nodes. It is therefore unfit for broadcast
+ /// write operations where we expect all nodes to successfully store
+ /// the written date.
+ pub async fn try_call_many<M, N, H, S>(
&self,
endpoint: &Arc<Endpoint<M, H>>,
to: &[Uuid],
msg: N,
- strategy: RequestStrategy<T>,
+ strategy: RequestStrategy<()>,
) -> Result<Vec<S>, Error>
where
M: Rpc<Response = Result<S, Error>> + 'static,
N: IntoReq<M>,
H: StreamingEndpointHandler<M> + 'static,
S: Send + 'static,
- T: Send + 'static,
{
let quorum = strategy.rs_quorum.unwrap_or(to.len());
let tracer = opentelemetry::global::tracer("garage");
- let span_name = if strategy.rs_interrupt_after_quorum {
- format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len())
- } else {
- format!(
- "RPC {} to {} (quorum {})",
- endpoint.path(),
- to.len(),
- quorum
- )
- };
+ let span_name = format!(
+ "RPC [{}] try_call_many (quorum {}/{})",
+ endpoint.path(),
+ quorum,
+ to.len()
+ );
+
let mut span = tracer.start(span_name);
span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
span.set_attribute(KeyValue::new("quorum", quorum as i64));
- span.set_attribute(KeyValue::new(
- "interrupt_after_quorum",
- strategy.rs_interrupt_after_quorum.to_string(),
- ));
- self.try_call_many_internal(endpoint, to, msg, strategy, quorum)
+ self.try_call_many_inner(endpoint, to, msg, strategy, quorum)
.with_context(Context::current_with_span(span))
.await
}
- async fn try_call_many_internal<M, N, H, S, T>(
+ async fn try_call_many_inner<M, N, H, S>(
&self,
endpoint: &Arc<Endpoint<M, H>>,
to: &[Uuid],
msg: N,
- strategy: RequestStrategy<T>,
+ strategy: RequestStrategy<()>,
quorum: usize,
) -> Result<Vec<S>, Error>
where
@@ -314,135 +333,244 @@ impl RpcHelper {
N: IntoReq<M>,
H: StreamingEndpointHandler<M> + 'static,
S: Send + 'static,
- T: Send + 'static,
{
- let msg = msg.into_req().map_err(garage_net::error::Error::from)?;
+ // Once quorum is reached, other requests don't matter.
+ // What we do here is only send the required number of requests
+ // to reach a quorum, priorizing nodes with the lowest latency.
+ // When there are errors, we start new requests to compensate.
- let (strategy, drop_on_complete) = strategy.extract_drop_on_complete();
+ // TODO: this could be made more aggressive, e.g. if after 2x the
+ // average ping of a given request, the response is not yet received,
+ // preemptively send an additional request to any remaining nodes.
+
+ // Reorder requests to priorize closeness / low latency
+ let request_order =
+ self.request_order(&self.0.layout.read().unwrap().current(), to.iter().copied());
+ let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false);
// Build future for each request
// They are not started now: they are added below in a FuturesUnordered
// object that will take care of polling them (see below)
- let requests = to.iter().cloned().map(|to| {
+ let msg = msg.into_req().map_err(garage_net::error::Error::from)?;
+ let mut requests = request_order.into_iter().map(|to| {
let self2 = self.clone();
let msg = msg.clone();
let endpoint2 = endpoint.clone();
let strategy = strategy.clone();
- (to, async move {
- self2.call(&endpoint2, to, msg, strategy).await
- })
+ async move { self2.call(&endpoint2, to, msg, strategy).await }
});
// Vectors in which success results and errors will be collected
let mut successes = vec![];
let mut errors = vec![];
- if strategy.rs_interrupt_after_quorum {
- // Case 1: once quorum is reached, other requests don't matter.
- // What we do here is only send the required number of requests
- // to reach a quorum, priorizing nodes with the lowest latency.
- // When there are errors, we start new requests to compensate.
-
- // Reorder requests to priorize closeness / low latency
- let request_order = self.request_order(to);
- let mut ord_requests = vec![(); request_order.len()]
- .into_iter()
- .map(|_| None)
- .collect::<Vec<_>>();
- for (to, fut) in requests {
- let i = request_order.iter().position(|x| *x == to).unwrap();
- ord_requests[i] = Some((to, fut));
+ // resp_stream will contain all of the requests that are currently in flight.
+ // (for the moment none, they will be added in the loop below)
+ let mut resp_stream = FuturesUnordered::new();
+
+ // Do some requests and collect results
+ while successes.len() < quorum {
+ // If the current set of requests that are running is not enough to possibly
+ // reach quorum, start some new requests.
+ while send_all_at_once || successes.len() + resp_stream.len() < quorum {
+ if let Some(fut) = requests.next() {
+ resp_stream.push(fut)
+ } else {
+ break;
+ }
+ }
+
+ if successes.len() + resp_stream.len() < quorum {
+ // We know we won't ever reach quorum
+ break;
}
- // Make an iterator to take requests in their sorted order
- let mut requests = ord_requests.into_iter().map(Option::unwrap);
-
- // resp_stream will contain all of the requests that are currently in flight.
- // (for the moment none, they will be added in the loop below)
- let mut resp_stream = FuturesUnordered::new();
-
- // Do some requests and collect results
- 'request_loop: while successes.len() < quorum {
- // If the current set of requests that are running is not enough to possibly
- // reach quorum, start some new requests.
- while successes.len() + resp_stream.len() < quorum {
- if let Some((req_to, fut)) = requests.next() {
- let tracer = opentelemetry::global::tracer("garage");
- let span = tracer.start(format!("RPC to {:?}", req_to));
- resp_stream.push(tokio::spawn(
- fut.with_context(Context::current_with_span(span)),
- ));
- } else {
- // If we have no request to add, we know that we won't ever
- // reach quorum: bail out now.
- break 'request_loop;
- }
+ // Wait for one request to terminate
+ match resp_stream.next().await.unwrap() {
+ Ok(msg) => {
+ successes.push(msg);
}
- assert!(!resp_stream.is_empty()); // because of loop invariants
-
- // Wait for one request to terminate
- match resp_stream.next().await.unwrap().unwrap() {
- Ok(msg) => {
- successes.push(msg);
- }
- Err(e) => {
- errors.push(e);
- }
+ Err(e) => {
+ errors.push(e);
}
}
+ }
+
+ if successes.len() >= quorum {
+ Ok(successes)
} else {
- // Case 2: all of the requests need to be sent in all cases,
- // and need to terminate. (this is the case for writes that
- // must be spread to n nodes)
- // Just start all the requests in parallel and return as soon
- // as the quorum is reached.
- let mut resp_stream = requests
- .map(|(_, fut)| fut)
- .collect::<FuturesUnordered<_>>();
-
- while let Some(resp) = resp_stream.next().await {
- match resp {
- Ok(msg) => {
- successes.push(msg);
- if successes.len() >= quorum {
- break;
- }
- }
- Err(e) => {
- errors.push(e);
- }
- }
- }
+ let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
+ Err(Error::Quorum(
+ quorum,
+ None,
+ successes.len(),
+ to.len(),
+ errors,
+ ))
+ }
+ }
+
+ /// Make a RPC call to multiple servers, returning either a Vec of responses,
+ /// or an error if quorum could not be reached due to too many errors
+ ///
+ /// Contrary to try_call_many, this fuction is especially made for broadcast
+ /// write operations. In particular:
+ ///
+ /// - The request are sent to all specified nodes as soon as `try_write_many_sets`
+ /// is invoked.
+ ///
+ /// - When `try_write_many_sets` returns, all remaining requests that haven't
+ /// completed move to a background task so that they have a chance to
+ /// complete successfully if there are no failures.
+ ///
+ /// In addition, the nodes to which requests should be sent are divided in
+ /// "quorum sets", and `try_write_many_sets` only returns once a quorum
+ /// has been validated in each set. This is used in the case of cluster layout
+ /// changes, where data has to be written both in the old layout and in the
+ /// new one as long as all nodes have not successfully tranisitionned and
+ /// moved all data to the new layout.
+ pub async fn try_write_many_sets<M, N, H, S, T>(
+ &self,
+ endpoint: &Arc<Endpoint<M, H>>,
+ to_sets: &[Vec<Uuid>],
+ msg: N,
+ strategy: RequestStrategy<T>,
+ ) -> Result<Vec<S>, Error>
+ where
+ M: Rpc<Response = Result<S, Error>> + 'static,
+ N: IntoReq<M>,
+ H: StreamingEndpointHandler<M> + 'static,
+ S: Send + 'static,
+ T: Send + 'static,
+ {
+ let quorum = strategy
+ .rs_quorum
+ .expect("internal error: missing quorum value in try_write_many_sets");
+
+ let tracer = opentelemetry::global::tracer("garage");
+ let span_name = format!(
+ "RPC [{}] try_write_many_sets (quorum {} in {} sets)",
+ endpoint.path(),
+ quorum,
+ to_sets.len()
+ );
+
+ let mut span = tracer.start(span_name);
+ span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
+ span.set_attribute(KeyValue::new("to", format!("{:?}", to_sets)));
+ span.set_attribute(KeyValue::new("quorum", quorum as i64));
+
+ self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum)
+ .with_context(Context::current_with_span(span))
+ .await
+ }
- if !resp_stream.is_empty() {
- // Continue remaining requests in background.
- // Note: these requests can get interrupted on process shutdown,
- // we must not count on them being executed for certain.
- // For all background things that have to happen with certainty,
- // they have to be put in a proper queue that is persisted to disk.
+ async fn try_write_many_sets_inner<M, N, H, S, T>(
+ &self,
+ endpoint: &Arc<Endpoint<M, H>>,
+ to_sets: &[Vec<Uuid>],
+ msg: N,
+ strategy: RequestStrategy<T>,
+ quorum: usize,
+ ) -> Result<Vec<S>, Error>
+ where
+ M: Rpc<Response = Result<S, Error>> + 'static,
+ N: IntoReq<M>,
+ H: StreamingEndpointHandler<M> + 'static,
+ S: Send + 'static,
+ T: Send + 'static,
+ {
+ // Peers may appear in many quorum sets. Here, build a list of peers,
+ // mapping to the index of the quorum sets in which they appear.
+ let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum);
+
+ let (strategy, drop_on_complete) = strategy.extract_drop_on_complete();
+
+ // Send one request to each peer of the quorum sets
+ let msg = msg.into_req().map_err(garage_net::error::Error::from)?;
+ let requests = result_tracker.nodes.keys().map(|peer| {
+ let self2 = self.clone();
+ let msg = msg.clone();
+ let endpoint2 = endpoint.clone();
+ let to = *peer;
+ let strategy = strategy.clone();
+ async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }
+ });
+ let mut resp_stream = requests.collect::<FuturesUnordered<_>>();
+
+ // Drive requests to completion
+ while let Some((node, resp)) = resp_stream.next().await {
+ // Store the response in the correct vector and increment the
+ // appropriate counters
+ result_tracker.register_result(node, resp);
+
+ // If we have a quorum of ok in all quorum sets, then it's a success!
+ if result_tracker.all_quorums_ok() {
+ // Continue all other requets in background
tokio::spawn(async move {
- resp_stream.collect::<Vec<Result<_, _>>>().await;
+ resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await;
drop(drop_on_complete);
});
+
+ return Ok(result_tracker.success_values());
+ }
+
+ // If there is a quorum set for which too many errors were received,
+ // we know it's impossible to get a quorum, so return immediately.
+ if result_tracker.too_many_failures() {
+ break;
}
}
- if successes.len() >= quorum {
- Ok(successes)
- } else {
- let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
- Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
+ // At this point, there is no quorum and we know that a quorum
+ // will never be achieved. Currently, we drop all remaining requests.
+ // Should we still move them to background so that they can continue
+ // for non-failed nodes? Not doing so has no impact on correctness,
+ // but it means that more cancellation messages will be sent. Idk.
+ // (When an in-progress request future is dropped, Netapp automatically
+ // sends a cancellation message to the remote node to inform it that
+ // the result is no longer needed. In turn, if the remote node receives
+ // the cancellation message in time, it interrupts the task of the
+ // running request handler.)
+
+ // Failure, could not get quorum
+ Err(result_tracker.quorum_error())
+ }
+
+ // ---- functions not related to MAKING RPCs, but just determining to what nodes
+ // they should be made and in which order ----
+
+ pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec<Uuid> {
+ let layout = self.0.layout.read().unwrap();
+
+ let mut ret = Vec::with_capacity(12);
+ let ver_iter = layout
+ .versions()
+ .iter()
+ .rev()
+ .chain(layout.inner().old_versions.iter().rev());
+ for ver in ver_iter {
+ if ver.version > layout.sync_map_min() {
+ continue;
+ }
+ let nodes = ver.nodes_of(position, ver.replication_factor);
+ for node in rpc_helper.request_order(layout.current(), nodes) {
+ if !ret.contains(&node) {
+ ret.push(node);
+ }
+ }
}
+ ret
}
- pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> {
+ fn request_order(
+ &self,
+ layout: &LayoutVersion,
+ nodes: impl Iterator<Item = Uuid>,
+ ) -> Vec<Uuid> {
// Retrieve some status variables that we will use to sort requests
let peer_list = self.0.peering.get_peer_list();
- let ring: Arc<Ring> = self.0.ring.borrow().clone();
- let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
- Some(pc) => &pc.zone,
- None => "",
- };
+ let our_zone = layout.get_node_zone(&self.0.our_node_id).unwrap_or("");
// Augment requests with some information used to sort them.
// The tuples are as follows:
@@ -451,22 +579,18 @@ impl RpcHelper {
// By sorting this vec, we priorize ourself, then nodes in the same zone,
// and within a same zone we priorize nodes with the lowest latency.
let mut nodes = nodes
- .iter()
.map(|to| {
- let peer_zone = match ring.layout.node_role(to) {
- Some(pc) => &pc.zone,
- None => "",
- };
+ let peer_zone = layout.get_node_zone(&to).unwrap_or("");
let peer_avg_ping = peer_list
.iter()
.find(|x| x.id.as_ref() == to.as_slice())
.and_then(|pi| pi.avg_ping)
.unwrap_or_else(|| Duration::from_secs(10));
(
- *to != self.0.our_node_id,
+ to != self.0.our_node_id,
peer_zone != our_zone,
peer_avg_ping,
- *to,
+ to,
)
})
.collect::<Vec<_>>();
@@ -480,3 +604,108 @@ impl RpcHelper {
.collect::<Vec<_>>()
}
}
+
+// ------- utility for tracking successes/errors among write sets --------
+
+pub struct QuorumSetResultTracker<S, E> {
+ /// The set of nodes and the index of the quorum sets they belong to
+ pub nodes: HashMap<Uuid, Vec<usize>>,
+ /// The quorum value, i.e. number of success responses to await in each set
+ pub quorum: usize,
+
+ /// The success responses received
+ pub successes: Vec<(Uuid, S)>,
+ /// The error responses received
+ pub failures: Vec<(Uuid, E)>,
+
+ /// The counters for successes in each set
+ pub success_counters: Box<[usize]>,
+ /// The counters for failures in each set
+ pub failure_counters: Box<[usize]>,
+ /// The total number of nodes in each set
+ pub set_lens: Box<[usize]>,
+}
+
+impl<S, E> QuorumSetResultTracker<S, E>
+where
+ E: std::fmt::Display,
+{
+ pub fn new<A>(sets: &[A], quorum: usize) -> Self
+ where
+ A: AsRef<[Uuid]>,
+ {
+ let mut nodes = HashMap::<Uuid, Vec<usize>>::new();
+ for (i, set) in sets.iter().enumerate() {
+ for node in set.as_ref().iter() {
+ nodes.entry(*node).or_default().push(i);
+ }
+ }
+
+ let num_nodes = nodes.len();
+ Self {
+ nodes,
+ quorum,
+ successes: Vec::with_capacity(num_nodes),
+ failures: vec![],
+ success_counters: vec![0; sets.len()].into_boxed_slice(),
+ failure_counters: vec![0; sets.len()].into_boxed_slice(),
+ set_lens: sets
+ .iter()
+ .map(|x| x.as_ref().len())
+ .collect::<Vec<_>>()
+ .into_boxed_slice(),
+ }
+ }
+
+ pub fn register_result(&mut self, node: Uuid, result: Result<S, E>) {
+ match result {
+ Ok(s) => {
+ self.successes.push((node, s));
+ for set in self.nodes.get(&node).unwrap().iter() {
+ self.success_counters[*set] += 1;
+ }
+ }
+ Err(e) => {
+ self.failures.push((node, e));
+ for set in self.nodes.get(&node).unwrap().iter() {
+ self.failure_counters[*set] += 1;
+ }
+ }
+ }
+ }
+
+ pub fn all_quorums_ok(&self) -> bool {
+ self.success_counters
+ .iter()
+ .all(|ok_cnt| *ok_cnt >= self.quorum)
+ }
+
+ pub fn too_many_failures(&self) -> bool {
+ self.failure_counters
+ .iter()
+ .zip(self.set_lens.iter())
+ .any(|(err_cnt, set_len)| *err_cnt + self.quorum > *set_len)
+ }
+
+ pub fn success_values(self) -> Vec<S> {
+ self.successes
+ .into_iter()
+ .map(|(_, x)| x)
+ .collect::<Vec<_>>()
+ }
+
+ pub fn quorum_error(self) -> Error {
+ let errors = self
+ .failures
+ .iter()
+ .map(|(n, e)| format!("{:?}: {}", n, e))
+ .collect::<Vec<_>>();
+ Error::Quorum(
+ self.quorum,
+ Some(self.set_lens.len()),
+ self.successes.len(),
+ self.nodes.len(),
+ errors,
+ )
+ }
+}