1 files changed, 88 insertions, 17 deletions
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs
index e9a9143f..f71f5ae7 100644
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -129,6 +129,12 @@ impl RpcHelper {
 		N: IntoReq<M> + Send,
 		H: StreamingEndpointHandler<M>,
 	{
+		let tracer = opentelemetry::global::tracer("garage");
+		let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to);
+		let mut span = tracer.start(span_name);
+		span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
+		span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
+
 		let metric_tags = [
 			KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
 			KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
@@ -140,6 +146,7 @@ impl RpcHelper {
 		let node_id = to.into();
 		let rpc_call = endpoint
 			.call_streaming(&node_id, msg, strat.rs_priority)
+			.with_context(Context::current_with_span(span))
 			.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
 
 		let timeout = async {
@@ -182,12 +189,17 @@ impl RpcHelper {
 		N: IntoReq<M>,
 		H: StreamingEndpointHandler<M>,
 	{
+		let tracer = opentelemetry::global::tracer("garage");
+		let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len());
+		let span = tracer.start(span_name);
+
 		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
 
 		let resps = join_all(
 			to.iter()
 				.map(|to| self.call(endpoint, *to, msg.clone(), strat)),
 		)
+		.with_context(Context::current_with_span(span))
 		.await;
 		Ok(to
 			.iter()
@@ -219,6 +231,22 @@ impl RpcHelper {
 
 	/// Make a RPC call to multiple servers, returning either a Vec of responses,
 	/// or an error if quorum could not be reached due to too many errors
+	///
+	/// If RequestStrategy has send_all_at_once set, then all requests will be
+	/// sent at once, and `try_call_many` will return as soon as a quorum of
+	/// responses is achieved, dropping and cancelling the remaining requests.
+	///
+	/// Otherwise, `quorum` requests will be sent at the same time, and if an
+	/// error response is received, a new request will be sent to replace it.
+	/// The ordering of nodes to which requests are sent is determined by
+	/// the `RpcHelper::request_order` function, which takes into account
+	/// parameters such as node zones and measured ping values.
+	///
+	/// In both cases, the basic contract of this function is that even in the
+	/// absence of failures, the RPC call might not be driven to completion
+	/// on all of the specified nodes. It is therefore unfit for broadcast
+	/// write operations where we expect all nodes to successfully store
+	/// the written date.
 	pub async fn try_call_many<M, N, H, S>(
 		&self,
 		endpoint: &Arc<Endpoint<M, H>>,
@@ -235,7 +263,12 @@ impl RpcHelper {
 		let quorum = strategy.rs_quorum.unwrap_or(to.len());
 
 		let tracer = opentelemetry::global::tracer("garage");
-		let span_name = format!("Read RPC {} to {} of {}", endpoint.path(), quorum, to.len());
+		let span_name = format!(
+			"RPC [{}] try_call_many (quorum {}/{})",
+			endpoint.path(),
+			quorum,
+			to.len()
+		);
 
 		let mut span = tracer.start(span_name);
 		span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
@@ -266,6 +299,10 @@ impl RpcHelper {
 		// to reach a quorum, priorizing nodes with the lowest latency.
 		// When there are errors, we start new requests to compensate.
 
+		// TODO: this could be made more aggressive, e.g. if after 2x the
+		// average ping of a given request, the response is not yet received,
+		// preemptively send an additional request to any remaining nodes.
+
 		// Reorder requests to priorize closeness / low latency
 		let request_order = self.request_order(to.iter().copied());
 		let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false);
@@ -278,9 +315,7 @@ impl RpcHelper {
 			let self2 = self.clone();
 			let msg = msg.clone();
 			let endpoint2 = endpoint.clone();
-			(to, async move {
-				self2.call(&endpoint2, to, msg, strategy).await
-			})
+			async move { self2.call(&endpoint2, to, msg, strategy).await }
 		});
 
 		// Vectors in which success results and errors will be collected
@@ -296,10 +331,8 @@ impl RpcHelper {
 			// If the current set of requests that are running is not enough to possibly
 			// reach quorum, start some new requests.
 			while send_all_at_once || successes.len() + resp_stream.len() < quorum {
-				if let Some((req_to, fut)) = requests.next() {
-					let tracer = opentelemetry::global::tracer("garage");
-					let span = tracer.start(format!("RPC to {:?}", req_to));
-					resp_stream.push(fut.with_context(Context::current_with_span(span)));
+				if let Some(fut) = requests.next() {
+					resp_stream.push(fut)
 				} else {
 					break;
 				}
@@ -379,6 +412,25 @@ impl RpcHelper {
 			.collect::<Vec<_>>()
 	}
 
+	/// Make a RPC call to multiple servers, returning either a Vec of responses,
+	/// or an error if quorum could not be reached due to too many errors
+	///
+	/// Contrary to try_call_many, this fuction is especially made for broadcast
+	/// write operations. In particular:
+	///
+	/// - The request are sent to all specified nodes as soon as `try_write_many_sets`
+	///   is invoked.
+	///
+	/// - When `try_write_many_sets` returns, all remaining requests that haven't
+	///   completed move to a background task so that they have a chance to
+	///   complete successfully if there are no failures.
+	///
+	/// In addition, the nodes to which requests should be sent are divided in
+	/// "quorum sets", and `try_write_many_sets` only returns once a quorum
+	/// has been validated in each set. This is used in the case of cluster layout
+	/// changes, where data has to be written both in the old layout and in the
+	/// new one as long as all nodes have not successfully tranisitionned and
+	/// moved all data to the new layout.
 	pub async fn try_write_many_sets<M, N, H, S>(
 		&self,
 		endpoint: &Arc<Endpoint<M, H>>,
@@ -394,11 +446,11 @@ impl RpcHelper {
 	{
 		let quorum = strategy
 			.rs_quorum
-			.expect("internal error: missing quroum in try_write_many_sets");
+			.expect("internal error: missing quorum value in try_write_many_sets");
 
 		let tracer = opentelemetry::global::tracer("garage");
 		let span_name = format!(
-			"Write RPC {} (quorum {} in {} sets)",
+			"RPC [{}] try_write_many_sets (quorum {} in {} sets)",
 			endpoint.path(),
 			quorum,
 			to_sets.len()
@@ -430,6 +482,8 @@ impl RpcHelper {
 	{
 		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
 
+		// Peers may appear in many quorum sets. Here, build a list of peers,
+		// mapping to the index of the quorum sets in which they appear.
 		let mut peers = HashMap::<Uuid, Vec<usize>>::new();
 		for (i, set) in to_sets.iter().enumerate() {
 			for peer in set.iter() {
@@ -437,24 +491,30 @@ impl RpcHelper {
 			}
 		}
 
+		// Send one request to each peer of the quorum sets
 		let requests = peers.iter().map(|(peer, _)| {
 			let self2 = self.clone();
 			let msg = msg.clone();
 			let endpoint2 = endpoint.clone();
 			let to = *peer;
-			let tracer = opentelemetry::global::tracer("garage");
-			let span = tracer.start(format!("RPC to {:?}", to));
-			let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) };
-			fut.with_context(Context::current_with_span(span))
+			async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }
 		});
 		let mut resp_stream = requests.collect::<FuturesUnordered<_>>();
 
+		// Success and error responses will be collected in these two vectors
 		let mut successes = vec![];
 		let mut errors = vec![];
 
+		// `set_counters` is used to keep track of how many success and error
+		// responses are received within each quorum set. When a node returns
+		// its response, it counts as a sucess/an error for all of the quorum
+		// sets which it is part of.
 		let mut set_counters = vec![(0, 0); to_sets.len()];
 
+		// Drive requests to completion
 		while let Some((node, resp)) = resp_stream.next().await {
+			// Store the response in the correct vector and increment the
+			// appropriate counters
 			match resp {
 				Ok(msg) => {
 					for set in peers.get(&node).unwrap().iter() {
@@ -470,9 +530,8 @@ impl RpcHelper {
 				}
 			}
 
+			// If we have a quorum of ok in all quorum sets, then it's a success!
 			if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) {
-				// Success
-
 				// Continue all other requets in background
 				tokio::spawn(async move {
 					resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await;
@@ -481,16 +540,28 @@ impl RpcHelper {
 				return Ok(successes);
 			}
 
+			// If there is a quorum set for which too many errors were received,
+			// we know it's impossible to get a quorum, so return immediately.
 			if set_counters
 				.iter()
 				.enumerate()
 				.any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len())
 			{
-				// Too many errors in this set, we know we won't get a quorum
 				break;
 			}
 		}
 
+		// At this point, there is no quorum and we know that a quorum
+		// will never be achieved. Currently, we drop all remaining requests.
+		// Should we still move them to background so that they can continue
+		// for non-failed nodes? Not doing so has no impact on correctness,
+		// but it means that more cancellation messages will be sent. Idk.
+		// (When an in-progress request future is dropped, Netapp automatically
+		// sends a cancellation message to the remote node to inform it that
+		// the result is no longer needed. In turn, if the remote node receives
+		// the cancellation message in time, it interrupts the task of the
+		// running request handler.)
+
 		// Failure, could not get quorum
 		let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
 		Err(Error::Quorum(