1 files changed, 159 insertions, 131 deletions
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs
index 0d722e43..97716b18 100644
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -6,10 +6,15 @@ use futures::future::join_all;
 use futures::stream::futures_unordered::FuturesUnordered;
 use futures::stream::StreamExt;
 use futures_util::future::FutureExt;
-use opentelemetry::KeyValue;
 use tokio::select;
 use tokio::sync::{watch, Semaphore};
 
+use opentelemetry::KeyValue;
+use opentelemetry::{
+	trace::{FutureExt as OtelFutureExt, Span, TraceContextExt, Tracer},
+	Context,
+};
+
 pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc};
 use netapp::peering::fullmesh::FullMeshPeeringStrategy;
 pub use netapp::proto::*;
@@ -147,9 +152,17 @@ impl RpcHelper {
 		self.0.metrics.rpc_counter.add(1, &metric_tags);
 		let rpc_start_time = SystemTime::now();
 
+		let tracer = opentelemetry::global::tracer("garage");
+		let mut span = tracer.start(format!("RPC {}", endpoint.path()));
+		span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
+
 		let node_id = to.into();
+		let rpc_call = endpoint
+			.call(&node_id, &msg, strat.rs_priority)
+			.with_context(Context::current_with_span(span));
+
 		select! {
-			res = endpoint.call(&node_id, &msg, strat.rs_priority) => {
+			res = rpc_call => {
 				drop(permit);
 
 				if res.is_err() {
@@ -228,149 +241,164 @@ impl RpcHelper {
 	where
 		M: Rpc<Response = Result<S, Error>> + 'static,
 		H: EndpointHandler<M> + 'static,
-		S: Send,
+		S: Send + 'static,
 	{
-		let msg = Arc::new(msg);
-
-		// Build future for each request
-		// They are not started now: they are added below in a FuturesUnordered
-		// object that will take care of polling them (see below)
-		let requests = to.iter().cloned().map(|to| {
-			let self2 = self.clone();
-			let msg = msg.clone();
-			let endpoint2 = endpoint.clone();
-			(to, async move {
-				self2.call_arc(&endpoint2, to, msg, strategy).await
-			})
-		});
 		let quorum = strategy.rs_quorum.unwrap_or(to.len());
 
-		// Vectors in which success results and errors will be collected
-		let mut successes = vec![];
-		let mut errors = vec![];
-
-		if strategy.rs_interrupt_after_quorum {
-			// Case 1: once quorum is reached, other requests don't matter.
-			// What we do here is only send the required number of requests
-			// to reach a quorum, priorizing nodes with the lowest latency.
-			// When there are errors, we start new requests to compensate.
-
-			// Retrieve some status variables that we will use to sort requests
-			let peer_list = self.0.fullmesh.get_peer_list();
-			let ring: Arc<Ring> = self.0.ring.borrow().clone();
-			let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
-				Some(pc) => &pc.zone,
-				None => "",
-			};
-
-			// Augment requests with some information used to sort them.
-			// The tuples are as follows:
-			//         (is another node?, is another zone?, latency, node ID, request future)
-			// We store all of these tuples in a vec that we can sort.
-			// By sorting this vec, we priorize ourself, then nodes in the same zone,
-			// and within a same zone we priorize nodes with the lowest latency.
-			let mut requests = requests
-				.map(|(to, fut)| {
-					let peer_zone = match ring.layout.node_role(&to) {
-						Some(pc) => &pc.zone,
-						None => "",
-					};
-					let peer_avg_ping = peer_list
-						.iter()
-						.find(|x| x.id.as_ref() == to.as_slice())
-						.map(|pi| pi.avg_ping)
-						.flatten()
-						.unwrap_or_else(|| Duration::from_secs(1));
-					(
-						to != self.0.our_node_id,
-						peer_zone != our_zone,
-						peer_avg_ping,
-						to,
-						fut,
-					)
+		let tracer = opentelemetry::global::tracer("garage");
+		let mut span = tracer.start(format!("RPC {} to {:?}", endpoint.path(), to));
+		span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
+		span.set_attribute(KeyValue::new("quorum", quorum as i64));
+
+		async {
+			let msg = Arc::new(msg);
+
+			// Build future for each request
+			// They are not started now: they are added below in a FuturesUnordered
+			// object that will take care of polling them (see below)
+			let requests = to.iter().cloned().map(|to| {
+				let self2 = self.clone();
+				let msg = msg.clone();
+				let endpoint2 = endpoint.clone();
+				(to, async move {
+					self2.call_arc(&endpoint2, to, msg, strategy).await
 				})
-				.collect::<Vec<_>>();
-
-			// Sort requests by (priorize ourself, priorize same zone, priorize low latency)
-			requests
-				.sort_by_key(|(diffnode, diffzone, ping, _to, _fut)| (*diffnode, *diffzone, *ping));
-
-			// Make an iterator to take requests in their sorted order
-			let mut requests = requests.into_iter();
-
-			// resp_stream will contain all of the requests that are currently in flight.
-			// (for the moment none, they will be added in the loop below)
-			let mut resp_stream = FuturesUnordered::new();
-
-			// Do some requests and collect results
-			'request_loop: while successes.len() < quorum {
-				// If the current set of requests that are running is not enough to possibly
-				// reach quorum, start some new requests.
-				while successes.len() + resp_stream.len() < quorum {
-					if let Some((_, _, _, _to, fut)) = requests.next() {
-						resp_stream.push(fut);
-					} else {
-						// If we have no request to add, we know that we won't ever
-						// reach quorum: bail out now.
-						break 'request_loop;
-					}
-				}
-				assert!(!resp_stream.is_empty()); // because of loop invariants
+			});
+
+			// Vectors in which success results and errors will be collected
+			let mut successes = vec![];
+			let mut errors = vec![];
+
+			if strategy.rs_interrupt_after_quorum {
+				// Case 1: once quorum is reached, other requests don't matter.
+				// What we do here is only send the required number of requests
+				// to reach a quorum, priorizing nodes with the lowest latency.
+				// When there are errors, we start new requests to compensate.
+
+				// Retrieve some status variables that we will use to sort requests
+				let peer_list = self.0.fullmesh.get_peer_list();
+				let ring: Arc<Ring> = self.0.ring.borrow().clone();
+				let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
+					Some(pc) => &pc.zone,
+					None => "",
+				};
+
+				// Augment requests with some information used to sort them.
+				// The tuples are as follows:
+				//         (is another node?, is another zone?, latency, node ID, request future)
+				// We store all of these tuples in a vec that we can sort.
+				// By sorting this vec, we priorize ourself, then nodes in the same zone,
+				// and within a same zone we priorize nodes with the lowest latency.
+				let mut requests = requests
+					.map(|(to, fut)| {
+						let peer_zone = match ring.layout.node_role(&to) {
+							Some(pc) => &pc.zone,
+							None => "",
+						};
+						let peer_avg_ping = peer_list
+							.iter()
+							.find(|x| x.id.as_ref() == to.as_slice())
+							.map(|pi| pi.avg_ping)
+							.flatten()
+							.unwrap_or_else(|| Duration::from_secs(1));
+						(
+							to != self.0.our_node_id,
+							peer_zone != our_zone,
+							peer_avg_ping,
+							to,
+							fut,
+						)
+					})
+					.collect::<Vec<_>>();
+
+				// Sort requests by (priorize ourself, priorize same zone, priorize low latency)
+				requests.sort_by_key(|(diffnode, diffzone, ping, _to, _fut)| {
+					(*diffnode, *diffzone, *ping)
+				});
 
-				// Wait for one request to terminate
-				match resp_stream.next().await.unwrap() {
-					Ok(msg) => {
-						successes.push(msg);
+				// Make an iterator to take requests in their sorted order
+				let mut requests = requests.into_iter();
+
+				// resp_stream will contain all of the requests that are currently in flight.
+				// (for the moment none, they will be added in the loop below)
+				let mut resp_stream = FuturesUnordered::new();
+
+				// Do some requests and collect results
+				'request_loop: while successes.len() < quorum {
+					// If the current set of requests that are running is not enough to possibly
+					// reach quorum, start some new requests.
+					while successes.len() + resp_stream.len() < quorum {
+						if let Some((_, _, _, req_to, fut)) = requests.next() {
+							let tracer = opentelemetry::global::tracer("garage");
+							let span = tracer.start(format!("RPC to {:?}", req_to));
+							resp_stream.push(tokio::spawn(
+								fut.with_context(Context::current_with_span(span)),
+							));
+						} else {
+							// If we have no request to add, we know that we won't ever
+							// reach quorum: bail out now.
+							break 'request_loop;
+						}
 					}
-					Err(e) => {
-						errors.push(e);
+					assert!(!resp_stream.is_empty()); // because of loop invariants
+
+					// Wait for one request to terminate
+					match resp_stream.next().await.unwrap().unwrap() {
+						Ok(msg) => {
+							successes.push(msg);
+						}
+						Err(e) => {
+							errors.push(e);
+						}
 					}
 				}
-			}
-		} else {
-			// Case 2: all of the requests need to be sent in all cases,
-			// and need to terminate. (this is the case for writes that
-			// must be spread to n nodes)
-			// Just start all the requests in parallel and return as soon
-			// as the quorum is reached.
-			let mut resp_stream = requests
-				.map(|(_, fut)| fut)
-				.collect::<FuturesUnordered<_>>();
-
-			while let Some(resp) = resp_stream.next().await {
-				match resp {
-					Ok(msg) => {
-						successes.push(msg);
-						if successes.len() >= quorum {
-							break;
+			} else {
+				// Case 2: all of the requests need to be sent in all cases,
+				// and need to terminate. (this is the case for writes that
+				// must be spread to n nodes)
+				// Just start all the requests in parallel and return as soon
+				// as the quorum is reached.
+				let mut resp_stream = requests
+					.map(|(_, fut)| fut)
+					.collect::<FuturesUnordered<_>>();
+
+				while let Some(resp) = resp_stream.next().await {
+					match resp {
+						Ok(msg) => {
+							successes.push(msg);
+							if successes.len() >= quorum {
+								break;
+							}
+						}
+						Err(e) => {
+							errors.push(e);
 						}
-					}
-					Err(e) => {
-						errors.push(e);
 					}
 				}
-			}
 
-			if !resp_stream.is_empty() {
-				// Continue remaining requests in background.
-				// Continue the remaining requests immediately using tokio::spawn
-				// but enqueue a task in the background runner
-				// to ensure that the process won't exit until the requests are done
-				// (if we had just enqueued the resp_stream.collect directly in the background runner,
-				// the requests might have been put on hold in the background runner's queue,
-				// in which case they might timeout or otherwise fail)
-				let wait_finished_fut = tokio::spawn(async move {
-					resp_stream.collect::<Vec<Result<_, _>>>().await;
-				});
-				self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
+				if !resp_stream.is_empty() {
+					// Continue remaining requests in background.
+					// Continue the remaining requests immediately using tokio::spawn
+					// but enqueue a task in the background runner
+					// to ensure that the process won't exit until the requests are done
+					// (if we had just enqueued the resp_stream.collect directly in the background runner,
+					// the requests might have been put on hold in the background runner's queue,
+					// in which case they might timeout or otherwise fail)
+					let wait_finished_fut = tokio::spawn(async move {
+						resp_stream.collect::<Vec<Result<_, _>>>().await;
+					});
+					self.0.background.spawn(wait_finished_fut.map(|_| Ok(())));
+				}
 			}
-		}
 
-		if successes.len() >= quorum {
-			Ok(successes)
-		} else {
-			let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
-			Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
+			if successes.len() >= quorum {
+				Ok(successes)
+			} else {
+				let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
+				Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
+			}
 		}
+		.with_context(Context::current_with_span(span))
+		.await
 	}
 }