| author | Mendes <mendes.oulamara@pm.me> | 2022-10-04 18:14:49 +0200 |
|---|---|---|
| committer | Mendes <mendes.oulamara@pm.me> | 2022-10-04 18:14:49 +0200 |
| commit | 829f815a897b04986559910bbcbf53625adcdf20 | |
| tree | 6db3c27cff2aded754a641d1f2b05c83be701267 /src/rpc | |
| parent | 99f96b9564c9c841dc6c56f1255a6e70ff884d46 | |
| parent | a096ced35562bd0a8877a1ee2f755be1edafe343 | |
Merge remote-tracking branch 'origin/main' into optimal-layout
Diffstat (limited to 'src/rpc')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/rpc/Cargo.toml | 11 |
| -rw-r--r-- | src/rpc/kubernetes.rs | 2 |
| -rw-r--r-- | src/rpc/layout.rs | 56 |
| -rw-r--r-- | src/rpc/metrics.rs | 19 |
| -rw-r--r-- | src/rpc/rpc_helper.rs | 244 |
| -rw-r--r-- | src/rpc/system.rs | 252 |

6 files changed, 375 insertions, 209 deletions
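The headline change in the `src/rpc/rpc_helper.rs` hunks below is the removal of the outgoing-request buffer semaphore and of the fixed 10-second RPC timeout: timeouts are now selected per request strategy (none, cluster default, or custom), with the cluster default coming from the new `rpc_timeout_msec` configuration value and falling back to 5 minutes. As a rough illustration of the resulting `RequestStrategy` builder, here is a sketch that assumes these items are reachable through the crate's `rpc_helper` module; it is not code taken from the commit:

```rust
use std::time::Duration;

// Module path and re-exports assumed; RequestStrategy and the PRIO_* constants
// are the items defined or re-exported in src/rpc/rpc_helper.rs below.
use garage_rpc::rpc_helper::{RequestStrategy, PRIO_BACKGROUND, PRIO_NORMAL};

fn example_strategies() -> (RequestStrategy, RequestStrategy) {
    // Read-style request: quorum of 2, a 30 s timeout instead of the cluster
    // default, and remaining requests dropped once the quorum is reached.
    let read = RequestStrategy::with_priority(PRIO_NORMAL)
        .with_quorum(2)
        .with_custom_timeout(Duration::from_secs(30))
        .interrupt_after_quorum(true);

    // Background request that should never be cut short by a timeout.
    let background = RequestStrategy::with_priority(PRIO_BACKGROUND).without_timeout();

    (read, background)
}
```

When no custom timeout is set, `Timeout::Default` resolves to the value passed to `RpcHelper::new`, i.e. `rpc_timeout_msec` from the node configuration, or `DEFAULT_TIMEOUT` (300 s) if unset.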
diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 654c1dc6..5bb6aae0 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_rpc" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" @@ -14,8 +14,7 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_util = { version = "0.7.0", path = "../util" } -garage_admin = { version = "0.7.0", path = "../admin" } +garage_util = { version = "0.8.0", path = "../util" } arc-swap = "1.0" bytes = "1.0" @@ -47,11 +46,11 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi tokio-stream = { version = "0.1", features = ["net"] } opentelemetry = "0.17" -#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" } -#netapp = { version = "0.4", path = "../../../netapp", features = ["telemetry"] } -netapp = { version = "0.4.2", features = ["telemetry"] } +netapp = { version = "0.5.2", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } + [features] kubernetes-discovery = [ "kube", "k8s-openapi", "openssl", "schemars" ] +system-libs = [ "sodiumoxide/use-pkg-config" ] diff --git a/src/rpc/kubernetes.rs b/src/rpc/kubernetes.rs index 939a0eed..197245aa 100644 --- a/src/rpc/kubernetes.rs +++ b/src/rpc/kubernetes.rs @@ -56,7 +56,7 @@ pub async fn get_kubernetes_nodes( let mut ret = Vec::with_capacity(nodes.items.len()); for node in nodes { - println!("Found Pod: {:?}", node.metadata.name); + info!("Found Pod: {:?}", node.metadata.name); let pubkey = &node .metadata diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index a878f19c..16d573c7 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize}; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; +use garage_util::error::*; use crate::graph_algo::*; @@ -144,6 +145,61 @@ impl ClusterLayout { } } + pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.roles.merge(&self.staging); + self.roles.retain(|(_, _, v)| v.0.is_some()); + + if !self.calculate_partition_assignation() { + return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); + } + + self.staging.clear(); + self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + + self.version += 1; + + Ok(self) + } + + pub fn revert_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. 
+ "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.staging.clear(); + self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + + self.version += 1; + + Ok(self) + } + /// Returns a list of IDs of nodes that currently have /// a role in the cluster pub fn node_ids(&self) -> &[Uuid] { diff --git a/src/rpc/metrics.rs b/src/rpc/metrics.rs index c900518c..61f8fa79 100644 --- a/src/rpc/metrics.rs +++ b/src/rpc/metrics.rs @@ -1,31 +1,18 @@ -use std::sync::Arc; - use opentelemetry::{global, metrics::*}; -use tokio::sync::Semaphore; /// TableMetrics reference all counter used for metrics pub struct RpcMetrics { - pub(crate) _rpc_available_permits: ValueObserver<u64>, - pub(crate) rpc_counter: Counter<u64>, pub(crate) rpc_timeout_counter: Counter<u64>, pub(crate) rpc_netapp_error_counter: Counter<u64>, pub(crate) rpc_garage_error_counter: Counter<u64>, pub(crate) rpc_duration: ValueRecorder<f64>, - pub(crate) rpc_queueing_time: ValueRecorder<f64>, } impl RpcMetrics { - pub fn new(sem: Arc<Semaphore>) -> Self { + pub fn new() -> Self { let meter = global::meter("garage_rpc"); RpcMetrics { - _rpc_available_permits: meter - .u64_value_observer("rpc.available_permits", move |observer| { - observer.observe(sem.available_permits() as u64, &[]) - }) - .with_description("Number of available RPC permits") - .init(), - rpc_counter: meter .u64_counter("rpc.request_counter") .with_description("Number of RPC requests emitted") @@ -46,10 +33,6 @@ impl RpcMetrics { .f64_value_recorder("rpc.duration") .with_description("Duration of RPCs") .init(), - rpc_queueing_time: meter - .f64_value_recorder("rpc.queueing_time") - .with_description("Time RPC requests were queued for before being sent") - .init(), } } } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 34717d3b..949aced6 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -7,7 +7,7 @@ use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::StreamExt; use futures_util::future::FutureExt; use tokio::select; -use tokio::sync::{watch, Semaphore}; +use tokio::sync::watch; use opentelemetry::KeyValue; use opentelemetry::{ @@ -15,10 +15,14 @@ use opentelemetry::{ Context, }; -pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc}; +pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler}; +use netapp::message::IntoReq; +pub use netapp::message::{ + Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, + PRIO_SECONDARY, +}; use netapp::peering::fullmesh::FullMeshPeeringStrategy; -pub use netapp::proto::*; -pub use netapp::{NetApp, NodeID}; +pub use netapp::{self, NetApp, NodeID}; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -28,34 +32,37 @@ use garage_util::metrics::RecordDuration; use crate::metrics::RpcMetrics; use crate::ring::Ring; -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); - -// Try to never have more than 200MB of outgoing requests -// buffered at the same time. Other requests are queued until -// space is freed. 
-const REQUEST_BUFFER_SIZE: usize = 200 * 1024 * 1024; +// Default RPC timeout = 5 minutes +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); /// Strategy to apply when making RPC #[derive(Copy, Clone)] pub struct RequestStrategy { - /// Max time to wait for reponse - pub rs_timeout: Duration, /// Min number of response to consider the request successful pub rs_quorum: Option<usize>, /// Should requests be dropped after enough response are received pub rs_interrupt_after_quorum: bool, /// Request priority pub rs_priority: RequestPriority, + /// Custom timeout for this request + rs_timeout: Timeout, +} + +#[derive(Copy, Clone)] +enum Timeout { + None, + Default, + Custom(Duration), } impl RequestStrategy { /// Create a RequestStrategy with default timeout and not interrupting when quorum reached pub fn with_priority(prio: RequestPriority) -> Self { RequestStrategy { - rs_timeout: DEFAULT_TIMEOUT, rs_quorum: None, rs_interrupt_after_quorum: false, rs_priority: prio, + rs_timeout: Timeout::Default, } } /// Set quorum to be reached for request @@ -63,17 +70,22 @@ impl RequestStrategy { self.rs_quorum = Some(quorum); self } - /// Set timeout of the strategy - pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.rs_timeout = timeout; - self - } /// Set if requests can be dropped after quorum has been reached /// In general true for read requests, and false for write pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self { self.rs_interrupt_after_quorum = interrupt; self } + /// Deactivate timeout for this request + pub fn without_timeout(mut self) -> Self { + self.rs_timeout = Timeout::None; + self + } + /// Set custom timeout for this request + pub fn with_custom_timeout(mut self, timeout: Duration) -> Self { + self.rs_timeout = Timeout::Custom(timeout); + self + } } #[derive(Clone)] @@ -84,8 +96,8 @@ struct RpcHelperInner { fullmesh: Arc<FullMeshPeeringStrategy>, background: Arc<BackgroundRunner>, ring: watch::Receiver<Arc<Ring>>, - request_buffer_semaphore: Arc<Semaphore>, metrics: RpcMetrics, + rpc_timeout: Duration, } impl RpcHelper { @@ -94,45 +106,35 @@ impl RpcHelper { fullmesh: Arc<FullMeshPeeringStrategy>, background: Arc<BackgroundRunner>, ring: watch::Receiver<Arc<Ring>>, + rpc_timeout: Option<Duration>, ) -> Self { - let sem = Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE)); - - let metrics = RpcMetrics::new(sem.clone()); + let metrics = RpcMetrics::new(); Self(Arc::new(RpcHelperInner { our_node_id, fullmesh, background, ring, - request_buffer_semaphore: sem, metrics, + rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) } - pub async fn call<M, H, S>( - &self, - endpoint: &Endpoint<M, H>, - to: Uuid, - msg: M, - strat: RequestStrategy, - ) -> Result<S, Error> - where - M: Rpc<Response = Result<S, Error>>, - H: EndpointHandler<M>, - { - self.call_arc(endpoint, to, Arc::new(msg), strat).await + pub fn rpc_timeout(&self) -> Duration { + self.0.rpc_timeout } - pub async fn call_arc<M, H, S>( + pub async fn call<M, N, H, S>( &self, endpoint: &Endpoint<M, H>, to: Uuid, - msg: Arc<M>, + msg: N, strat: RequestStrategy, ) -> Result<S, Error> where M: Rpc<Response = Result<S, Error>>, - H: EndpointHandler<M>, + N: IntoReq<M> + Send, + H: StreamingEndpointHandler<M>, { let metric_tags = [ KeyValue::new("rpc_endpoint", endpoint.path().to_string()), @@ -140,29 +142,27 @@ impl RpcHelper { KeyValue::new("to", format!("{:?}", to)), ]; - let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32; - let permit = self - .0 - .request_buffer_semaphore - 
.acquire_many(msg_size) - .record_duration(&self.0.metrics.rpc_queueing_time, &metric_tags) - .await?; - self.0.metrics.rpc_counter.add(1, &metric_tags); let node_id = to.into(); let rpc_call = endpoint - .call(&node_id, msg, strat.rs_priority) + .call_streaming(&node_id, msg, strat.rs_priority) .record_duration(&self.0.metrics.rpc_duration, &metric_tags); + let timeout = async { + match strat.rs_timeout { + Timeout::None => futures::future::pending().await, + Timeout::Default => tokio::time::sleep(self.0.rpc_timeout).await, + Timeout::Custom(t) => tokio::time::sleep(t).await, + } + }; + select! { res = rpc_call => { - drop(permit); - if res.is_err() { self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags); } - let res = res?; + let res = res?.into_msg(); if res.is_err() { self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags); @@ -170,46 +170,49 @@ impl RpcHelper { Ok(res?) } - _ = tokio::time::sleep(strat.rs_timeout) => { - drop(permit); + () = timeout => { self.0.metrics.rpc_timeout_counter.add(1, &metric_tags); Err(Error::Timeout) } } } - pub async fn call_many<M, H, S>( + pub async fn call_many<M, N, H, S>( &self, endpoint: &Endpoint<M, H>, to: &[Uuid], - msg: M, + msg: N, strat: RequestStrategy, - ) -> Vec<(Uuid, Result<S, Error>)> + ) -> Result<Vec<(Uuid, Result<S, Error>)>, Error> where M: Rpc<Response = Result<S, Error>>, - H: EndpointHandler<M>, + N: IntoReq<M>, + H: StreamingEndpointHandler<M>, { - let msg = Arc::new(msg); + let msg = msg.into_req().map_err(netapp::error::Error::from)?; + let resps = join_all( to.iter() - .map(|to| self.call_arc(endpoint, *to, msg.clone(), strat)), + .map(|to| self.call(endpoint, *to, msg.clone(), strat)), ) .await; - to.iter() + Ok(to + .iter() .cloned() .zip(resps.into_iter()) - .collect::<Vec<_>>() + .collect::<Vec<_>>()) } - pub async fn broadcast<M, H, S>( + pub async fn broadcast<M, N, H, S>( &self, endpoint: &Endpoint<M, H>, - msg: M, + msg: N, strat: RequestStrategy, - ) -> Vec<(Uuid, Result<S, Error>)> + ) -> Result<Vec<(Uuid, Result<S, Error>)>, Error> where M: Rpc<Response = Result<S, Error>>, - H: EndpointHandler<M>, + N: IntoReq<M>, + H: StreamingEndpointHandler<M>, { let to = self .0 @@ -223,16 +226,17 @@ impl RpcHelper { /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors - pub async fn try_call_many<M, H, S>( + pub async fn try_call_many<M, N, H, S>( &self, endpoint: &Arc<Endpoint<M, H>>, to: &[Uuid], - msg: M, + msg: N, strategy: RequestStrategy, ) -> Result<Vec<S>, Error> where M: Rpc<Response = Result<S, Error>> + 'static, - H: EndpointHandler<M> + 'static, + N: IntoReq<M>, + H: StreamingEndpointHandler<M> + 'static, S: Send + 'static, { let quorum = strategy.rs_quorum.unwrap_or(to.len()); @@ -262,20 +266,21 @@ impl RpcHelper { .await } - async fn try_call_many_internal<M, H, S>( + async fn try_call_many_internal<M, N, H, S>( &self, endpoint: &Arc<Endpoint<M, H>>, to: &[Uuid], - msg: M, + msg: N, strategy: RequestStrategy, quorum: usize, ) -> Result<Vec<S>, Error> where M: Rpc<Response = Result<S, Error>> + 'static, - H: EndpointHandler<M> + 'static, + N: IntoReq<M>, + H: StreamingEndpointHandler<M> + 'static, S: Send + 'static, { - let msg = Arc::new(msg); + let msg = msg.into_req().map_err(netapp::error::Error::from)?; // Build future for each request // They are not started now: they are added below in a FuturesUnordered @@ -285,7 +290,7 @@ impl RpcHelper { let msg = msg.clone(); let endpoint2 = endpoint.clone(); (to, 
async move { - self2.call_arc(&endpoint2, to, msg, strategy).await + self2.call(&endpoint2, to, msg, strategy).await }) }); @@ -299,47 +304,19 @@ impl RpcHelper { // to reach a quorum, priorizing nodes with the lowest latency. // When there are errors, we start new requests to compensate. - // Retrieve some status variables that we will use to sort requests - let peer_list = self.0.fullmesh.get_peer_list(); - let ring: Arc<Ring> = self.0.ring.borrow().clone(); - let our_zone = match ring.layout.node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; - - // Augment requests with some information used to sort them. - // The tuples are as follows: - // (is another node?, is another zone?, latency, node ID, request future) - // We store all of these tuples in a vec that we can sort. - // By sorting this vec, we priorize ourself, then nodes in the same zone, - // and within a same zone we priorize nodes with the lowest latency. - let mut requests = requests - .map(|(to, fut)| { - let peer_zone = match ring.layout.node_role(&to) { - Some(pc) => &pc.zone, - None => "", - }; - let peer_avg_ping = peer_list - .iter() - .find(|x| x.id.as_ref() == to.as_slice()) - .and_then(|pi| pi.avg_ping) - .unwrap_or_else(|| Duration::from_secs(1)); - ( - to != self.0.our_node_id, - peer_zone != our_zone, - peer_avg_ping, - to, - fut, - ) - }) + // Reorder requests to priorize closeness / low latency + let request_order = self.request_order(to); + let mut ord_requests = vec![(); request_order.len()] + .into_iter() + .map(|_| None) .collect::<Vec<_>>(); - - // Sort requests by (priorize ourself, priorize same zone, priorize low latency) - requests - .sort_by_key(|(diffnode, diffzone, ping, _to, _fut)| (*diffnode, *diffzone, *ping)); + for (to, fut) in requests { + let i = request_order.iter().position(|x| *x == to).unwrap(); + ord_requests[i] = Some((to, fut)); + } // Make an iterator to take requests in their sorted order - let mut requests = requests.into_iter(); + let mut requests = ord_requests.into_iter().map(Option::unwrap); // resp_stream will contain all of the requests that are currently in flight. // (for the moment none, they will be added in the loop below) @@ -350,7 +327,7 @@ impl RpcHelper { // If the current set of requests that are running is not enough to possibly // reach quorum, start some new requests. while successes.len() + resp_stream.len() < quorum { - if let Some((_, _, _, req_to, fut)) = requests.next() { + if let Some((req_to, fut)) = requests.next() { let tracer = opentelemetry::global::tracer("garage"); let span = tracer.start(format!("RPC to {:?}", req_to)); resp_stream.push(tokio::spawn( @@ -420,4 +397,49 @@ impl RpcHelper { Err(Error::Quorum(quorum, successes.len(), to.len(), errors)) } } + + pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> { + // Retrieve some status variables that we will use to sort requests + let peer_list = self.0.fullmesh.get_peer_list(); + let ring: Arc<Ring> = self.0.ring.borrow().clone(); + let our_zone = match ring.layout.node_role(&self.0.our_node_id) { + Some(pc) => &pc.zone, + None => "", + }; + + // Augment requests with some information used to sort them. + // The tuples are as follows: + // (is another node?, is another zone?, latency, node ID, request future) + // We store all of these tuples in a vec that we can sort. + // By sorting this vec, we priorize ourself, then nodes in the same zone, + // and within a same zone we priorize nodes with the lowest latency. 
+ let mut nodes = nodes + .iter() + .map(|to| { + let peer_zone = match ring.layout.node_role(to) { + Some(pc) => &pc.zone, + None => "", + }; + let peer_avg_ping = peer_list + .iter() + .find(|x| x.id.as_ref() == to.as_slice()) + .and_then(|pi| pi.avg_ping) + .unwrap_or_else(|| Duration::from_secs(10)); + ( + *to != self.0.our_node_id, + peer_zone != our_zone, + peer_avg_ping, + *to, + ) + }) + .collect::<Vec<_>>(); + + // Sort requests by (priorize ourself, priorize same zone, priorize low latency) + nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping)); + + nodes + .into_iter() + .map(|(_, _, _, to)| to) + .collect::<Vec<_>>() + } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 34031b10..7eb25195 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::{Arc, RwLock}; use std::time::{Duration, Instant}; @@ -16,9 +16,9 @@ use tokio::sync::watch; use tokio::sync::Mutex; use netapp::endpoint::{Endpoint, EndpointHandler}; +use netapp::message::*; use netapp::peering::fullmesh::FullMeshPeeringStrategy; -use netapp::proto::*; -use netapp::util::parse_and_resolve_peer_addr; +use netapp::util::parse_and_resolve_peer_addr_async; use netapp::{NetApp, NetworkKey, NodeID, NodeKey}; use garage_util::background::BackgroundRunner; @@ -37,10 +37,11 @@ use crate::rpc_helper::*; const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60); const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); -const PING_TIMEOUT: Duration = Duration::from_secs(2); -/// Version tag used for version check upon Netapp connection -pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007 +/// Version tag used for version check upon Netapp connection. +/// Cluster nodes with different version tags are deemed +/// incompatible and will refuse to connect. 
+pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008 /// RPC endpoint used for calls related to membership pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; @@ -90,7 +91,7 @@ pub struct System { rpc_listen_addr: SocketAddr, rpc_public_addr: Option<SocketAddr>, - bootstrap_peers: Vec<(NodeID, SocketAddr)>, + bootstrap_peers: Vec<String>, consul_discovery: Option<ConsulDiscoveryParam>, #[cfg(feature = "kubernetes-discovery")] @@ -104,6 +105,9 @@ pub struct System { /// The job runner of this node pub background: Arc<BackgroundRunner>, + + /// Path to metadata directory + pub metadata_dir: PathBuf, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -194,7 +198,7 @@ impl System { replication_factor: usize, zone_redundancy: usize, config: &Config, - ) -> Arc<Self> { + ) -> Result<Arc<Self>, Error> { let node_key = gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); info!( @@ -202,11 +206,21 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout"); + let persist_cluster_layout: Persister<ClusterLayout> = + Persister::new(&config.metadata_dir, "cluster_layout"); let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); let cluster_layout = match persist_cluster_layout.load() { - Ok(x) => x, + Ok(x) => { + if x.replication_factor != replication_factor { + return Err(Error::Message(format!( + "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", + x.replication_factor, + replication_factor + ))); + } + x + } Err(e) => { info!( "No valid previous cluster layout stored ({}), starting fresh.", @@ -228,8 +242,29 @@ impl System { let ring = Ring::new(cluster_layout, replication_factor); let (update_ring, ring) = watch::channel(Arc::new(ring)); - let rpc_public_addr = match config.rpc_public_addr { - Some(a) => Some(a), + let rpc_public_addr = match &config.rpc_public_addr { + Some(a_str) => { + use std::net::ToSocketAddrs; + match a_str.to_socket_addrs() { + Err(e) => { + error!( + "Cannot resolve rpc_public_addr {} from config file: {}.", + a_str, e + ); + None + } + Ok(a) => { + let a = a.collect::<Vec<_>>(); + if a.is_empty() { + error!("rpc_public_addr {} resolve to no known IP address", a_str); + } + if a.len() > 1 { + warn!("Multiple possible resolutions for rpc_public_addr: {:?}. 
Taking the first one.", a); + } + a.into_iter().next() + } + } + } None => { let addr = get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); @@ -239,13 +274,15 @@ impl System { addr } }; + if rpc_public_addr.is_none() { + warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication."); + } let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key); - let fullmesh = FullMeshPeeringStrategy::new( - netapp.clone(), - config.bootstrap_peers.clone(), - rpc_public_addr, - ); + let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr); + if let Some(ping_timeout) = config.rpc_ping_timeout_msec { + fullmesh.set_ping_timeout_millis(ping_timeout); + } let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); @@ -283,7 +320,13 @@ impl System { node_status: RwLock::new(HashMap::new()), netapp: netapp.clone(), fullmesh: fullmesh.clone(), - rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), + rpc: RpcHelper::new( + netapp.id.into(), + fullmesh, + background.clone(), + ring.clone(), + config.rpc_timeout_msec.map(Duration::from_millis), + ), system_endpoint, replication_factor, rpc_listen_addr: config.rpc_bind_addr, @@ -296,9 +339,10 @@ impl System { ring, update_ring: Mutex::new(update_ring), background, + metadata_dir: config.metadata_dir.clone(), }); sys.system_endpoint.set_handler(sys.clone()); - sys + Ok(sys) } /// Perform bootstraping, starting the ping loop @@ -313,6 +357,80 @@ impl System { ); } + // ---- Administrative operations (directly available and + // also available through RPC) ---- + + pub fn get_known_nodes(&self) -> Vec<KnownNodeInfo> { + let node_status = self.node_status.read().unwrap(); + let known_nodes = self + .fullmesh + .get_peer_list() + .iter() + .map(|n| KnownNodeInfo { + id: n.id.into(), + addr: n.addr, + is_up: n.is_up(), + last_seen_secs_ago: n + .last_seen + .map(|t| (Instant::now().saturating_duration_since(t)).as_secs()), + status: node_status + .get(&n.id.into()) + .cloned() + .map(|(_, st)| st) + .unwrap_or(NodeStatus { + hostname: "?".to_string(), + replication_factor: 0, + cluster_layout_version: 0, + cluster_layout_staging_hash: Hash::from([0u8; 32]), + }), + }) + .collect::<Vec<_>>(); + known_nodes + } + + pub fn get_cluster_layout(&self) -> ClusterLayout { + self.ring.borrow().layout.clone() + } + + pub async fn update_cluster_layout( + self: &Arc<Self>, + layout: &ClusterLayout, + ) -> Result<(), Error> { + self.handle_advertise_cluster_layout(layout).await?; + Ok(()) + } + + pub async fn connect(&self, node: &str) -> Result<(), Error> { + let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node) + .await + .ok_or_else(|| { + Error::Message(format!( + "Unable to parse or resolve node specification: {}", + node + )) + })?; + let mut errors = vec![]; + for ip in addrs.iter() { + match self + .netapp + .clone() + .try_connect(*ip, pubkey) + .await + .err_context(CONNECT_ERROR_MESSAGE) + { + Ok(()) => return Ok(()), + Err(e) => { + errors.push((*ip, e)); + } + } + } + if errors.len() == 1 { + Err(Error::Message(errors[0].1.to_string())) + } else { + Err(Error::Message(format!("{:?}", errors))) + } + } + // ---- INTERNALS ---- async fn advertise_to_consul(self: Arc<Self>) -> Result<(), Error> { @@ -385,32 +503,11 @@ impl System { self.local_status.swap(Arc::new(new_si)); } + // --- RPC HANDLERS --- + async fn handle_connect(&self, node: &str) -> Result<SystemRpc, Error> { - let (pubkey, addrs) = 
parse_and_resolve_peer_addr(node).ok_or_else(|| { - Error::Message(format!( - "Unable to parse or resolve node specification: {}", - node - )) - })?; - let mut errors = vec![]; - for ip in addrs.iter() { - match self - .netapp - .clone() - .try_connect(*ip, pubkey) - .await - .err_context(CONNECT_ERROR_MESSAGE) - { - Ok(()) => return Ok(SystemRpc::Ok), - Err(e) => { - errors.push((*ip, e)); - } - } - } - return Err(Error::Message(format!( - "Could not connect to specified peers. Errors: {:?}", - errors - ))); + self.connect(node).await?; + Ok(SystemRpc::Ok) } fn handle_pull_cluster_layout(&self) -> SystemRpc { @@ -419,28 +516,7 @@ impl System { } fn handle_get_known_nodes(&self) -> SystemRpc { - let node_status = self.node_status.read().unwrap(); - let known_nodes = self - .fullmesh - .get_peer_list() - .iter() - .map(|n| KnownNodeInfo { - id: n.id.into(), - addr: n.addr, - is_up: n.is_up(), - last_seen_secs_ago: n.last_seen.map(|t| (Instant::now() - t).as_secs()), - status: node_status - .get(&n.id.into()) - .cloned() - .map(|(_, st)| st) - .unwrap_or(NodeStatus { - hostname: "?".to_string(), - replication_factor: 0, - cluster_layout_version: 0, - cluster_layout_staging_hash: Hash::from([0u8; 32]), - }), - }) - .collect::<Vec<_>>(); + let known_nodes = self.get_known_nodes(); SystemRpc::ReturnKnownNodes(known_nodes) } @@ -452,7 +528,7 @@ impl System { let local_info = self.local_status.load(); if local_info.replication_factor < info.replication_factor { - error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and might lead to bugs", + error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. Shutting down for safety.", info.replication_factor, local_info.replication_factor); std::process::exit(1); @@ -477,9 +553,19 @@ impl System { } async fn handle_advertise_cluster_layout( - self: Arc<Self>, + self: &Arc<Self>, adv: &ClusterLayout, ) -> Result<SystemRpc, Error> { + if adv.replication_factor != self.replication_factor { + let msg = format!( + "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). 
Discarding the cluster layout we received.", + adv.replication_factor, + self.replication_factor + ); + error!("{}", msg); + return Err(Error::Message(msg)); + } + let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); @@ -505,7 +591,7 @@ impl System { SystemRpc::AdvertiseClusterLayout(layout), RequestStrategy::with_priority(PRIO_HIGH), ) - .await; + .await?; Ok(()) }); self.background.spawn(self.clone().save_cluster_layout()); @@ -520,11 +606,12 @@ impl System { self.update_local_status(); let local_status: NodeStatus = self.local_status.load().as_ref().clone(); - self.rpc + let _ = self + .rpc .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH), ) .await; @@ -550,7 +637,7 @@ impl System { if not_configured || no_peers || bad_peers { info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers); - let mut ping_list = self.bootstrap_peers.clone(); + let mut ping_list = resolve_peers(&self.bootstrap_peers).await; // Add peer list from list stored on disk if let Ok(peers) = self.persist_peer_list.load_async().await { @@ -648,7 +735,7 @@ impl System { &self.system_endpoint, peer, SystemRpc::PullClusterLayout, - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH), ) .await; if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { @@ -681,6 +768,25 @@ fn get_default_ip() -> Option<IpAddr> { .map(|a| a.ip()) } +async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> { + let mut ret = vec![]; + + for peer in peers.iter() { + match parse_and_resolve_peer_addr_async(peer).await { + Some((pubkey, addrs)) => { + for ip in addrs { + ret.push((pubkey, ip)); + } + } + None => { + warn!("Unable to parse and/or resolve peer hostname {}", peer); + } + } + } + + ret +} + struct ConsulDiscoveryParam { consul_host: String, service_name: String, |
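Circling back to the `src/rpc/layout.rs` hunk near the top of this diff: `apply_staged_changes` and `revert_staged_changes` both consume the `ClusterLayout`, require the operator to pass the next version number explicitly, and return the updated layout on success. The sketch below shows the intended calling pattern; the driver function is hypothetical, and the module paths and the public visibility of `ClusterLayout::version` are assumptions.

```rust
use garage_rpc::layout::ClusterLayout; // path assumed
use garage_util::error::Error;

// Hypothetical driver for the staged-layout workflow added in src/rpc/layout.rs;
// `layout` is assumed to have just been pulled from the cluster.
fn finalize_layout(layout: ClusterLayout, apply: bool) -> Result<ClusterLayout, Error> {
    // Both methods refuse to proceed unless the caller passes exactly
    // current version + 1, confirming which revision is being modified.
    let next_version = Some(layout.version + 1);
    if apply {
        // Merge the staged role changes, recompute the partition assignation,
        // clear the staging area and bump the version.
        layout.apply_staged_changes(next_version)
    } else {
        // Discard the staged role changes and bump the version.
        layout.revert_staged_changes(next_version)
    }
}
```

Requiring the explicit `version + 1` means a stale command aimed at a layout that has since moved on fails with "Invalid new layout version" instead of silently overwriting newer staged changes.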