aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2023-12-07 11:50:00 +0100
committer: Alex Auvolat <alex@adnab.me> 2023-12-07 11:56:14 +0100
commit: aa59059a910eb6e1e824b84413a66909d697ef8a (patch)
tree: 967e1ebbc2772b5d3edda68d28acc9f346a80ca8
parent: d90de365b3b30cb631b22fcd62c98bddb5a91549 (diff)
download: garage-aa59059a910eb6e1e824b84413a66909d697ef8a.tar.gz
garage-aa59059a910eb6e1e824b84413a66909d697ef8a.zip
layout cli: safer skip-dead-nodes command
-rw-r--r-- src/garage/cli/cmd.rs 23
-rw-r--r-- src/garage/cli/layout.rs 35
-rw-r--r-- src/garage/cli/structs.rs 12
3 files changed, 49 insertions, 21 deletions
diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs
index c7f0ad2b..196c0cb3 100644
--- a/src/garage/cli/cmd.rs
+++ b/src/garage/cli/cmd.rs
@@ -49,13 +49,7 @@ pub async fn cli_command_dispatch(
}
pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> {
- let status = match rpc_cli
- .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
- .await??
- {
- SystemRpc::ReturnKnownNodes(nodes) => nodes,
- resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
- };
+ let status = fetch_status(rpc_cli, rpc_host).await?;
let layout = fetch_layout(rpc_cli, rpc_host).await?;
println!("==== HEALTHY NODES ====");
@@ -268,3 +262,18 @@ pub async fn cmd_admin(
}
Ok(())
}
+
+// ---- utility ----
+
+pub async fn fetch_status(
+ rpc_cli: &Endpoint<SystemRpc, ()>,
+ rpc_host: NodeID,
+) -> Result<Vec<KnownNodeInfo>, Error> {
+ match rpc_cli
+ .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
+ .await??
+ {
+ SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes),
+ resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
+ }
+}
diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index 3c7843bd..cdf77c04 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -33,8 +33,8 @@ pub async fn cli_layout_command_dispatch(
cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await
}
LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await,
- LayoutOperation::AssumeSync(assume_sync_opt) => {
- cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await
+ LayoutOperation::SkipDeadNodes(assume_sync_opt) => {
+ cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await
}
}
}
@@ -388,13 +388,21 @@ pub async fn cmd_layout_history(
Ok(())
}
-pub async fn cmd_layout_assume_sync(
+pub async fn cmd_layout_skip_dead_nodes(
rpc_cli: &Endpoint<SystemRpc, ()>,
rpc_host: NodeID,
- opt: AssumeSyncOpt,
+ opt: SkipDeadNodesOpt,
) -> Result<(), Error> {
+ let status = fetch_status(rpc_cli, rpc_host).await?;
let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
+ if layout.versions.len() == 1 {
+ return Err(Error::Message(
+ "This command cannot be called when there is only one live cluster layout version"
+ .into(),
+ ));
+ }
+
let min_v = layout.min_stored();
if opt.version <= min_v || opt.version > layout.current().version {
return Err(Error::Message(format!(
@@ -408,12 +416,19 @@ pub async fn cmd_layout_assume_sync(
let all_nodes = layout.get_all_nodes();
for node in all_nodes.iter() {
- layout.update_trackers.ack_map.set_max(*node, opt.version);
- layout.update_trackers.sync_map.set_max(*node, opt.version);
- layout
- .update_trackers
- .sync_ack_map
- .set_max(*node, opt.version);
+ if status.iter().any(|x| x.id == *node && x.is_up) {
+ continue;
+ }
+
+ if layout.update_trackers.ack_map.set_max(*node, opt.version) {
+ println!("Increased the ACK tracker for node {:?}", node);
+ }
+
+ if opt.allow_missing_data {
+ if layout.update_trackers.sync_map.set_max(*node, opt.version) {
+ println!("Increased the SYNC tracker for node {:?}", node);
+ }
+ }
}
send_layout(rpc_cli, rpc_host, layout).await?;
diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs
index c4b400f4..6bc3da22 100644
--- a/src/garage/cli/structs.rs
+++ b/src/garage/cli/structs.rs
@@ -117,9 +117,9 @@ pub enum LayoutOperation {
#[structopt(name = "history", version = garage_version())]
History,
- /// Assume all nodes are synchronized up to a certain layout version
- #[structopt(name = "assume-sync", version = garage_version())]
- AssumeSync(AssumeSyncOpt),
+ /// Skip dead nodes when waiting for a new layout version to be synchronized
+ #[structopt(name = "skip-dead-nodes", version = garage_version())]
+ SkipDeadNodes(SkipDeadNodesOpt),
}
#[derive(StructOpt, Debug)]
@@ -178,11 +178,15 @@ pub struct RevertLayoutOpt {
}
#[derive(StructOpt, Debug)]
-pub struct AssumeSyncOpt {
+pub struct SkipDeadNodesOpt {
/// Version number of the layout to assume is currently up-to-date.
/// This will generally be the current layout version.
#[structopt(long = "version")]
pub(crate) version: u64,
+ /// Allow the skip even if a quorum of nodes could not be found for
+ /// the data among the remaining nodes
+ #[structopt(long = "allow-missing-data")]
+ pub(crate) allow_missing_data: bool,
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]