diff options
Diffstat (limited to 'src/rpc/rpc_helper.rs')
-rw-r--r-- | src/rpc/rpc_helper.rs | 509 |
1 files changed, 369 insertions, 140 deletions
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index a1b7951c..ea3e5e76 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -1,12 +1,12 @@ //! Contain structs related to making RPCs -use std::sync::Arc; +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; use std::time::Duration; use futures::future::join_all; use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::StreamExt; use tokio::select; -use tokio::sync::watch; use opentelemetry::KeyValue; use opentelemetry::{ @@ -26,8 +26,8 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; +use crate::layout::{LayoutHelper, LayoutVersion}; use crate::metrics::RpcMetrics; -use crate::ring::Ring; // Default RPC timeout = 5 minutes const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); @@ -35,11 +35,11 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); /// Strategy to apply when making RPC pub struct RequestStrategy<T> { /// Min number of response to consider the request successful - pub rs_quorum: Option<usize>, - /// Should requests be dropped after enough response are received - pub rs_interrupt_after_quorum: bool, + rs_quorum: Option<usize>, + /// Send all requests at once + rs_send_all_at_once: Option<bool>, /// Request priority - pub rs_priority: RequestPriority, + rs_priority: RequestPriority, /// Custom timeout for this request rs_timeout: Timeout, /// Data to drop when everything completes @@ -57,7 +57,7 @@ impl Clone for RequestStrategy<()> { fn clone(&self) -> Self { RequestStrategy { rs_quorum: self.rs_quorum, - rs_interrupt_after_quorum: self.rs_interrupt_after_quorum, + rs_send_all_at_once: self.rs_send_all_at_once, rs_priority: self.rs_priority, rs_timeout: self.rs_timeout, rs_drop_on_complete: (), @@ -70,7 +70,7 @@ impl RequestStrategy<()> { pub fn with_priority(prio: RequestPriority) -> Self { RequestStrategy { rs_quorum: None, - rs_interrupt_after_quorum: false, + rs_send_all_at_once: None, rs_priority: prio, rs_timeout: Timeout::Default, rs_drop_on_complete: (), @@ -80,7 +80,7 @@ impl RequestStrategy<()> { pub fn with_drop_on_completion<T>(self, drop_on_complete: T) -> RequestStrategy<T> { RequestStrategy { rs_quorum: self.rs_quorum, - rs_interrupt_after_quorum: self.rs_interrupt_after_quorum, + rs_send_all_at_once: self.rs_send_all_at_once, rs_priority: self.rs_priority, rs_timeout: self.rs_timeout, rs_drop_on_complete: drop_on_complete, @@ -94,10 +94,9 @@ impl<T> RequestStrategy<T> { self.rs_quorum = Some(quorum); self } - /// Set if requests can be dropped after quorum has been reached - /// In general true for read requests, and false for write - pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self { - self.rs_interrupt_after_quorum = interrupt; + /// Set quorum to be reached for request + pub fn send_all_at_once(mut self, value: bool) -> Self { + self.rs_send_all_at_once = Some(value); self } /// Deactivate timeout for this request @@ -115,7 +114,7 @@ impl<T> RequestStrategy<T> { ( RequestStrategy { rs_quorum: self.rs_quorum, - rs_interrupt_after_quorum: self.rs_interrupt_after_quorum, + rs_send_all_at_once: self.rs_send_all_at_once, rs_priority: self.rs_priority, rs_timeout: self.rs_timeout, rs_drop_on_complete: (), @@ -131,7 +130,7 @@ pub struct RpcHelper(Arc<RpcHelperInner>); struct RpcHelperInner { our_node_id: Uuid, peering: Arc<PeeringManager>, - ring: watch::Receiver<Arc<Ring>>, + layout: Arc<RwLock<LayoutHelper>>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -140,7 +139,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, peering: Arc<PeeringManager>, - ring: watch::Receiver<Arc<Ring>>, + layout: Arc<RwLock<LayoutHelper>>, rpc_timeout: Option<Duration>, ) -> Self { let metrics = RpcMetrics::new(); @@ -148,7 +147,7 @@ impl RpcHelper { Self(Arc::new(RpcHelperInner { our_node_id, peering, - ring, + layout, metrics, rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) @@ -170,6 +169,12 @@ impl RpcHelper { N: IntoReq<M> + Send, H: StreamingEndpointHandler<M>, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to); + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to))); + let metric_tags = [ KeyValue::new("rpc_endpoint", endpoint.path().to_string()), KeyValue::new("from", format!("{:?}", self.0.our_node_id)), @@ -181,6 +186,7 @@ impl RpcHelper { let node_id = to.into(); let rpc_call = endpoint .call_streaming(&node_id, msg, strat.rs_priority) + .with_context(Context::current_with_span(span)) .record_duration(&self.0.metrics.rpc_duration, &metric_tags); let timeout = async { @@ -223,12 +229,17 @@ impl RpcHelper { N: IntoReq<M>, H: StreamingEndpointHandler<M>, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len()); + let span = tracer.start(span_name); + let msg = msg.into_req().map_err(garage_net::error::Error::from)?; let resps = join_all( to.iter() .map(|to| self.call(endpoint, *to, msg.clone(), strat.clone())), ) + .with_context(Context::current_with_span(span)) .await; Ok(to .iter() @@ -260,53 +271,61 @@ impl RpcHelper { /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors - pub async fn try_call_many<M, N, H, S, T>( + /// + /// If RequestStrategy has send_all_at_once set, then all requests will be + /// sent at once, and `try_call_many` will return as soon as a quorum of + /// responses is achieved, dropping and cancelling the remaining requests. + /// + /// Otherwise, `quorum` requests will be sent at the same time, and if an + /// error response is received, a new request will be sent to replace it. + /// The ordering of nodes to which requests are sent is determined by + /// the `RpcHelper::request_order` function, which takes into account + /// parameters such as node zones and measured ping values. + /// + /// In both cases, the basic contract of this function is that even in the + /// absence of failures, the RPC call might not be driven to completion + /// on all of the specified nodes. It is therefore unfit for broadcast + /// write operations where we expect all nodes to successfully store + /// the written date. + pub async fn try_call_many<M, N, H, S>( &self, endpoint: &Arc<Endpoint<M, H>>, to: &[Uuid], msg: N, - strategy: RequestStrategy<T>, + strategy: RequestStrategy<()>, ) -> Result<Vec<S>, Error> where M: Rpc<Response = Result<S, Error>> + 'static, N: IntoReq<M>, H: StreamingEndpointHandler<M> + 'static, S: Send + 'static, - T: Send + 'static, { let quorum = strategy.rs_quorum.unwrap_or(to.len()); let tracer = opentelemetry::global::tracer("garage"); - let span_name = if strategy.rs_interrupt_after_quorum { - format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len()) - } else { - format!( - "RPC {} to {} (quorum {})", - endpoint.path(), - to.len(), - quorum - ) - }; + let span_name = format!( + "RPC [{}] try_call_many (quorum {}/{})", + endpoint.path(), + quorum, + to.len() + ); + let mut span = tracer.start(span_name); span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); span.set_attribute(KeyValue::new("to", format!("{:?}", to))); span.set_attribute(KeyValue::new("quorum", quorum as i64)); - span.set_attribute(KeyValue::new( - "interrupt_after_quorum", - strategy.rs_interrupt_after_quorum.to_string(), - )); - self.try_call_many_internal(endpoint, to, msg, strategy, quorum) + self.try_call_many_inner(endpoint, to, msg, strategy, quorum) .with_context(Context::current_with_span(span)) .await } - async fn try_call_many_internal<M, N, H, S, T>( + async fn try_call_many_inner<M, N, H, S>( &self, endpoint: &Arc<Endpoint<M, H>>, to: &[Uuid], msg: N, - strategy: RequestStrategy<T>, + strategy: RequestStrategy<()>, quorum: usize, ) -> Result<Vec<S>, Error> where @@ -314,135 +333,244 @@ impl RpcHelper { N: IntoReq<M>, H: StreamingEndpointHandler<M> + 'static, S: Send + 'static, - T: Send + 'static, { - let msg = msg.into_req().map_err(garage_net::error::Error::from)?; + // Once quorum is reached, other requests don't matter. + // What we do here is only send the required number of requests + // to reach a quorum, priorizing nodes with the lowest latency. + // When there are errors, we start new requests to compensate. - let (strategy, drop_on_complete) = strategy.extract_drop_on_complete(); + // TODO: this could be made more aggressive, e.g. if after 2x the + // average ping of a given request, the response is not yet received, + // preemptively send an additional request to any remaining nodes. + + // Reorder requests to priorize closeness / low latency + let request_order = + self.request_order(&self.0.layout.read().unwrap().current(), to.iter().copied()); + let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request // They are not started now: they are added below in a FuturesUnordered // object that will take care of polling them (see below) - let requests = to.iter().cloned().map(|to| { + let msg = msg.into_req().map_err(garage_net::error::Error::from)?; + let mut requests = request_order.into_iter().map(|to| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); let strategy = strategy.clone(); - (to, async move { - self2.call(&endpoint2, to, msg, strategy).await - }) + async move { self2.call(&endpoint2, to, msg, strategy).await } }); // Vectors in which success results and errors will be collected let mut successes = vec![]; let mut errors = vec![]; - if strategy.rs_interrupt_after_quorum { - // Case 1: once quorum is reached, other requests don't matter. - // What we do here is only send the required number of requests - // to reach a quorum, priorizing nodes with the lowest latency. - // When there are errors, we start new requests to compensate. - - // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to); - let mut ord_requests = vec![(); request_order.len()] - .into_iter() - .map(|_| None) - .collect::<Vec<_>>(); - for (to, fut) in requests { - let i = request_order.iter().position(|x| *x == to).unwrap(); - ord_requests[i] = Some((to, fut)); + // resp_stream will contain all of the requests that are currently in flight. + // (for the moment none, they will be added in the loop below) + let mut resp_stream = FuturesUnordered::new(); + + // Do some requests and collect results + while successes.len() < quorum { + // If the current set of requests that are running is not enough to possibly + // reach quorum, start some new requests. + while send_all_at_once || successes.len() + resp_stream.len() < quorum { + if let Some(fut) = requests.next() { + resp_stream.push(fut) + } else { + break; + } + } + + if successes.len() + resp_stream.len() < quorum { + // We know we won't ever reach quorum + break; } - // Make an iterator to take requests in their sorted order - let mut requests = ord_requests.into_iter().map(Option::unwrap); - - // resp_stream will contain all of the requests that are currently in flight. - // (for the moment none, they will be added in the loop below) - let mut resp_stream = FuturesUnordered::new(); - - // Do some requests and collect results - 'request_loop: while successes.len() < quorum { - // If the current set of requests that are running is not enough to possibly - // reach quorum, start some new requests. - while successes.len() + resp_stream.len() < quorum { - if let Some((req_to, fut)) = requests.next() { - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(tokio::spawn( - fut.with_context(Context::current_with_span(span)), - )); - } else { - // If we have no request to add, we know that we won't ever - // reach quorum: bail out now. - break 'request_loop; - } + // Wait for one request to terminate + match resp_stream.next().await.unwrap() { + Ok(msg) => { + successes.push(msg); } - assert!(!resp_stream.is_empty()); // because of loop invariants - - // Wait for one request to terminate - match resp_stream.next().await.unwrap().unwrap() { - Ok(msg) => { - successes.push(msg); - } - Err(e) => { - errors.push(e); - } + Err(e) => { + errors.push(e); } } + } + + if successes.len() >= quorum { + Ok(successes) } else { - // Case 2: all of the requests need to be sent in all cases, - // and need to terminate. (this is the case for writes that - // must be spread to n nodes) - // Just start all the requests in parallel and return as soon - // as the quorum is reached. - let mut resp_stream = requests - .map(|(_, fut)| fut) - .collect::<FuturesUnordered<_>>(); - - while let Some(resp) = resp_stream.next().await { - match resp { - Ok(msg) => { - successes.push(msg); - if successes.len() >= quorum { - break; - } - } - Err(e) => { - errors.push(e); - } - } - } + let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); + Err(Error::Quorum( + quorum, + None, + successes.len(), + to.len(), + errors, + )) + } + } + + /// Make a RPC call to multiple servers, returning either a Vec of responses, + /// or an error if quorum could not be reached due to too many errors + /// + /// Contrary to try_call_many, this fuction is especially made for broadcast + /// write operations. In particular: + /// + /// - The request are sent to all specified nodes as soon as `try_write_many_sets` + /// is invoked. + /// + /// - When `try_write_many_sets` returns, all remaining requests that haven't + /// completed move to a background task so that they have a chance to + /// complete successfully if there are no failures. + /// + /// In addition, the nodes to which requests should be sent are divided in + /// "quorum sets", and `try_write_many_sets` only returns once a quorum + /// has been validated in each set. This is used in the case of cluster layout + /// changes, where data has to be written both in the old layout and in the + /// new one as long as all nodes have not successfully tranisitionned and + /// moved all data to the new layout. + pub async fn try_write_many_sets<M, N, H, S, T>( + &self, + endpoint: &Arc<Endpoint<M, H>>, + to_sets: &[Vec<Uuid>], + msg: N, + strategy: RequestStrategy<T>, + ) -> Result<Vec<S>, Error> + where + M: Rpc<Response = Result<S, Error>> + 'static, + N: IntoReq<M>, + H: StreamingEndpointHandler<M> + 'static, + S: Send + 'static, + T: Send + 'static, + { + let quorum = strategy + .rs_quorum + .expect("internal error: missing quorum value in try_write_many_sets"); + + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!( + "RPC [{}] try_write_many_sets (quorum {} in {} sets)", + endpoint.path(), + quorum, + to_sets.len() + ); + + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to_sets))); + span.set_attribute(KeyValue::new("quorum", quorum as i64)); + + self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum) + .with_context(Context::current_with_span(span)) + .await + } - if !resp_stream.is_empty() { - // Continue remaining requests in background. - // Note: these requests can get interrupted on process shutdown, - // we must not count on them being executed for certain. - // For all background things that have to happen with certainty, - // they have to be put in a proper queue that is persisted to disk. + async fn try_write_many_sets_inner<M, N, H, S, T>( + &self, + endpoint: &Arc<Endpoint<M, H>>, + to_sets: &[Vec<Uuid>], + msg: N, + strategy: RequestStrategy<T>, + quorum: usize, + ) -> Result<Vec<S>, Error> + where + M: Rpc<Response = Result<S, Error>> + 'static, + N: IntoReq<M>, + H: StreamingEndpointHandler<M> + 'static, + S: Send + 'static, + T: Send + 'static, + { + // Peers may appear in many quorum sets. Here, build a list of peers, + // mapping to the index of the quorum sets in which they appear. + let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum); + + let (strategy, drop_on_complete) = strategy.extract_drop_on_complete(); + + // Send one request to each peer of the quorum sets + let msg = msg.into_req().map_err(garage_net::error::Error::from)?; + let requests = result_tracker.nodes.keys().map(|peer| { + let self2 = self.clone(); + let msg = msg.clone(); + let endpoint2 = endpoint.clone(); + let to = *peer; + let strategy = strategy.clone(); + async move { (to, self2.call(&endpoint2, to, msg, strategy).await) } + }); + let mut resp_stream = requests.collect::<FuturesUnordered<_>>(); + + // Drive requests to completion + while let Some((node, resp)) = resp_stream.next().await { + // Store the response in the correct vector and increment the + // appropriate counters + result_tracker.register_result(node, resp); + + // If we have a quorum of ok in all quorum sets, then it's a success! + if result_tracker.all_quorums_ok() { + // Continue all other requets in background tokio::spawn(async move { - resp_stream.collect::<Vec<Result<_, _>>>().await; + resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await; drop(drop_on_complete); }); + + return Ok(result_tracker.success_values()); + } + + // If there is a quorum set for which too many errors were received, + // we know it's impossible to get a quorum, so return immediately. + if result_tracker.too_many_failures() { + break; } } - if successes.len() >= quorum { - Ok(successes) - } else { - let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); - Err(Error::Quorum(quorum, successes.len(), to.len(), errors)) + // At this point, there is no quorum and we know that a quorum + // will never be achieved. Currently, we drop all remaining requests. + // Should we still move them to background so that they can continue + // for non-failed nodes? Not doing so has no impact on correctness, + // but it means that more cancellation messages will be sent. Idk. + // (When an in-progress request future is dropped, Netapp automatically + // sends a cancellation message to the remote node to inform it that + // the result is no longer needed. In turn, if the remote node receives + // the cancellation message in time, it interrupts the task of the + // running request handler.) + + // Failure, could not get quorum + Err(result_tracker.quorum_error()) + } + + // ---- functions not related to MAKING RPCs, but just determining to what nodes + // they should be made and in which order ---- + + pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec<Uuid> { + let layout = self.0.layout.read().unwrap(); + + let mut ret = Vec::with_capacity(12); + let ver_iter = layout + .versions() + .iter() + .rev() + .chain(layout.inner().old_versions.iter().rev()); + for ver in ver_iter { + if ver.version > layout.sync_map_min() { + continue; + } + let nodes = ver.nodes_of(position, ver.replication_factor); + for node in rpc_helper.request_order(layout.current(), nodes) { + if !ret.contains(&node) { + ret.push(node); + } + } } + ret } - pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> { + fn request_order( + &self, + layout: &LayoutVersion, + nodes: impl Iterator<Item = Uuid>, + ) -> Vec<Uuid> { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.peering.get_peer_list(); - let ring: Arc<Ring> = self.0.ring.borrow().clone(); - let our_zone = match ring.layout.node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; + let our_zone = layout.get_node_zone(&self.0.our_node_id).unwrap_or(""); // Augment requests with some information used to sort them. // The tuples are as follows: @@ -451,22 +579,18 @@ impl RpcHelper { // By sorting this vec, we priorize ourself, then nodes in the same zone, // and within a same zone we priorize nodes with the lowest latency. let mut nodes = nodes - .iter() .map(|to| { - let peer_zone = match ring.layout.node_role(to) { - Some(pc) => &pc.zone, - None => "", - }; + let peer_zone = layout.get_node_zone(&to).unwrap_or(""); let peer_avg_ping = peer_list .iter() .find(|x| x.id.as_ref() == to.as_slice()) .and_then(|pi| pi.avg_ping) .unwrap_or_else(|| Duration::from_secs(10)); ( - *to != self.0.our_node_id, + to != self.0.our_node_id, peer_zone != our_zone, peer_avg_ping, - *to, + to, ) }) .collect::<Vec<_>>(); @@ -480,3 +604,108 @@ impl RpcHelper { .collect::<Vec<_>>() } } + +// ------- utility for tracking successes/errors among write sets -------- + +pub struct QuorumSetResultTracker<S, E> { + /// The set of nodes and the index of the quorum sets they belong to + pub nodes: HashMap<Uuid, Vec<usize>>, + /// The quorum value, i.e. number of success responses to await in each set + pub quorum: usize, + + /// The success responses received + pub successes: Vec<(Uuid, S)>, + /// The error responses received + pub failures: Vec<(Uuid, E)>, + + /// The counters for successes in each set + pub success_counters: Box<[usize]>, + /// The counters for failures in each set + pub failure_counters: Box<[usize]>, + /// The total number of nodes in each set + pub set_lens: Box<[usize]>, +} + +impl<S, E> QuorumSetResultTracker<S, E> +where + E: std::fmt::Display, +{ + pub fn new<A>(sets: &[A], quorum: usize) -> Self + where + A: AsRef<[Uuid]>, + { + let mut nodes = HashMap::<Uuid, Vec<usize>>::new(); + for (i, set) in sets.iter().enumerate() { + for node in set.as_ref().iter() { + nodes.entry(*node).or_default().push(i); + } + } + + let num_nodes = nodes.len(); + Self { + nodes, + quorum, + successes: Vec::with_capacity(num_nodes), + failures: vec![], + success_counters: vec![0; sets.len()].into_boxed_slice(), + failure_counters: vec![0; sets.len()].into_boxed_slice(), + set_lens: sets + .iter() + .map(|x| x.as_ref().len()) + .collect::<Vec<_>>() + .into_boxed_slice(), + } + } + + pub fn register_result(&mut self, node: Uuid, result: Result<S, E>) { + match result { + Ok(s) => { + self.successes.push((node, s)); + for set in self.nodes.get(&node).unwrap().iter() { + self.success_counters[*set] += 1; + } + } + Err(e) => { + self.failures.push((node, e)); + for set in self.nodes.get(&node).unwrap().iter() { + self.failure_counters[*set] += 1; + } + } + } + } + + pub fn all_quorums_ok(&self) -> bool { + self.success_counters + .iter() + .all(|ok_cnt| *ok_cnt >= self.quorum) + } + + pub fn too_many_failures(&self) -> bool { + self.failure_counters + .iter() + .zip(self.set_lens.iter()) + .any(|(err_cnt, set_len)| *err_cnt + self.quorum > *set_len) + } + + pub fn success_values(self) -> Vec<S> { + self.successes + .into_iter() + .map(|(_, x)| x) + .collect::<Vec<_>>() + } + + pub fn quorum_error(self) -> Error { + let errors = self + .failures + .iter() + .map(|(n, e)| format!("{:?}: {}", n, e)) + .collect::<Vec<_>>(); + Error::Quorum( + self.quorum, + Some(self.set_lens.len()), + self.successes.len(), + self.nodes.len(), + errors, + ) + } +} |