From 8dccee3ccfe7793c42203f28c1e91c6f989b6899 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 19:28:36 +0100 Subject: cluster layout: adapt all uses of ClusterLayout to LayoutHistory --- src/garage/cli/cmd.rs | 17 +++++++++++------ src/garage/cli/layout.rs | 38 +++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 23 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 48359614..8be43873 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -62,7 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - match layout.roles.get(&adv.id) { + match layout.current().roles.get(&adv.id) { Some(NodeRoleV(Some(cfg))) => { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -102,10 +102,15 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> format_table(healthy_nodes); let status_keys = status.iter().map(|adv| adv.id).collect::>(); - let failure_case_1 = status - .iter() - .any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_))))); + let failure_case_1 = status.iter().any(|adv| { + !adv.is_up + && matches!( + layout.current().roles.get(&adv.id), + Some(NodeRoleV(Some(_))) + ) + }); let failure_case_2 = layout + .current() .roles .items() .iter() @@ -115,7 +120,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut failed_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) { + if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let tf = timeago::Formatter::new(); failed_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", @@ -132,7 +137,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } } - for (id, _, role_v) in layout.roles.items().iter() { + for (id, _, role_v) in layout.current().roles.items().iter() { if let NodeRoleV(Some(cfg)) = role_v { if !status_keys.contains(id) { failed_nodes.push(format!( diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index ce2b11e0..4a617337 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -58,17 +58,18 @@ pub async fn cmd_assign_role( status .iter() .map(|adv| adv.id) - .chain(layout.node_ids().iter().cloned()), + .chain(layout.current().node_ids().iter().cloned()), node_id, ) }) .collect::, _>>()?; - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); for replaced in args.replace.iter() { - let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; + let replaced_node = + find_matching_node(layout.current().node_ids().iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout @@ -149,7 +150,7 @@ pub async fn cmd_remove_role( ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); let deleted_node = @@ -174,13 +175,16 @@ pub async fn cmd_show_layout( let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER 
LAYOUT ===="); - print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); + print_cluster_layout(layout.current(), "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); println!(); - println!("Current cluster layout version: {}", layout.version); + println!( + "Current cluster layout version: {}", + layout.current().version + ); let has_role_changes = print_staging_role_changes(&layout); if has_role_changes { - let v = layout.version; + let v = layout.current().version; let res_apply = layout.apply_staged_changes(Some(v + 1)); // this will print the stats of what partitions @@ -189,7 +193,7 @@ pub async fn cmd_show_layout( Ok((layout, msg)) => { println!(); println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - print_cluster_layout(&layout, "No nodes have a role in the new layout."); + print_cluster_layout(layout.current(), "No nodes have a role in the new layout."); println!(); for line in msg.iter() { @@ -266,11 +270,11 @@ pub async fn cmd_config_layout( .parse::() .ok_or_message("invalid zone redundancy value")?; if let ZoneRedundancy::AtLeast(r_int) = r { - if r_int > layout.replication_factor { + if r_int > layout.current().replication_factor { return Err(Error::Message(format!( "The zone redundancy must be smaller or equal to the \ replication factor ({}).", - layout.replication_factor + layout.current().replication_factor ))); } else if r_int < 1 { return Err(Error::Message( @@ -302,7 +306,7 @@ pub async fn cmd_config_layout( pub async fn fetch_layout( rpc_cli: &Endpoint, rpc_host: NodeID, -) -> Result { +) -> Result { match rpc_cli .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? @@ -315,7 +319,7 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - layout: ClusterLayout, + layout: LayoutHistory, ) -> Result<(), Error> { rpc_cli .call( @@ -327,7 +331,7 @@ pub async fn send_layout( Ok(()) } -pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { +pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) { let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { @@ -366,13 +370,13 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { } } -pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { +pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { let has_role_changes = layout .staging_roles .items() .iter() - .any(|(k, _, v)| layout.roles.get(k) != Some(v)); - let has_layout_changes = *layout.staging_parameters.get() != layout.parameters; + .any(|(k, _, v)| layout.current().roles.get(k) != Some(v)); + let has_layout_changes = *layout.staging_parameters.get() != layout.current().parameters; if has_role_changes || has_layout_changes { println!(); @@ -380,7 +384,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { if has_role_changes { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; for (id, _, role) in layout.staging_roles.items().iter() { - if layout.roles.get(id) == Some(role) { + if layout.current().roles.get(id) == Some(role) { continue; } if let Some(role) = &role.0 { -- cgit v1.2.3 From 523d2ecb9511f74e144cd116b942d6c1bf0f546d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 11:19:43 +0100 Subject: layout: use separate CRDT for staged layout changes --- src/garage/cli/cmd.rs 
| 2 +- src/garage/cli/layout.rs | 47 +++++++++++++++++++++++++++++------------------ src/garage/cli/structs.rs | 6 +++--- 3 files changed, 33 insertions(+), 22 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 8be43873..1a054025 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -85,7 +85,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } _ => { - let new_role = match layout.staging_roles.get(&adv.id) { + let new_role = match layout.staging.get().roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "(pending)", _ => "NO ROLE ASSIGNED", }; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 4a617337..269d92f4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -65,7 +65,7 @@ pub async fn cmd_assign_role( .collect::, _>>()?; let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); for replaced in args.replace.iter() { let replaced_node = @@ -73,7 +73,9 @@ pub async fn cmd_assign_role( match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); } _ => { @@ -131,7 +133,9 @@ pub async fn cmd_assign_role( }; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); } @@ -151,13 +155,15 @@ pub async fn cmd_remove_role( let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); let deleted_node = find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); send_layout(rpc_cli, rpc_host, layout).await?; @@ -203,16 +209,12 @@ pub async fn cmd_show_layout( println!(); println!(" garage layout apply --version {}", v + 1); println!(); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } Err(e) => { println!("Error while trying to compute the assignment: {}", e); println!("This new layout cannot yet be applied."); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } } } @@ -245,9 +247,15 @@ pub async fn cmd_revert_layout( rpc_host: NodeID, revert_opt: RevertLayoutOpt, ) -> Result<(), Error> { + if !revert_opt.yes { + return Err(Error::Message( + "Please add the --yes flag to run the layout revert operation".into(), + )); + } + let layout = fetch_layout(rpc_cli, rpc_host).await?; - let layout = layout.revert_staged_changes(revert_opt.version)?; + let layout = layout.revert_staged_changes()?; send_layout(rpc_cli, rpc_host, layout).await?; @@ -284,7 +292,9 @@ pub async fn cmd_config_layout( } layout - .staging_parameters + .staging + .get_mut() + .parameters .update(LayoutParameters { zone_redundancy: r }); println!("The zone redundancy parameter has been set to '{}'.", r); did_something = true; @@ -371,19 +381,20 @@ pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) { } pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { - let has_role_changes = 
layout - .staging_roles + let staging = layout.staging.get(); + let has_role_changes = staging + .roles .items() .iter() .any(|(k, _, v)| layout.current().roles.get(k) != Some(v)); - let has_layout_changes = *layout.staging_parameters.get() != layout.current().parameters; + let has_layout_changes = *staging.parameters.get() != layout.current().parameters; if has_role_changes || has_layout_changes { println!(); println!("==== STAGED ROLE CHANGES ===="); if has_role_changes { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging_roles.items().iter() { + for (id, _, role) in staging.roles.items().iter() { if layout.current().roles.get(id) == Some(role) { continue; } @@ -406,7 +417,7 @@ pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { if has_layout_changes { println!( "Zone redundancy: {}", - layout.staging_parameters.get().zone_redundancy + staging.parameters.get().zone_redundancy ); } true diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index aba57551..3badc447 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -164,9 +164,9 @@ pub struct ApplyLayoutOpt { #[derive(StructOpt, Debug)] pub struct RevertLayoutOpt { - /// Version number of old configuration to which to revert - #[structopt(long = "version")] - pub(crate) version: Option, + /// The revert operation will not be ran unless this flag is added + #[structopt(long = "yes")] + pub(crate) yes: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug)] -- cgit v1.2.3 From 19ef1ec8e7fee3a6c670e6e35dfcc83f0801e604 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 13:34:14 +0100 Subject: layout: more refactoring --- src/garage/cli/layout.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 269d92f4..bffc81d3 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use bytesize::ByteSize; use format_table::format_table; @@ -321,7 +323,7 @@ pub async fn fetch_layout( .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? { - SystemRpc::AdvertiseClusterLayout(t) => Ok(t), + SystemRpc::AdvertiseClusterLayout(t) => Ok(Arc::try_unwrap(t).unwrap()), resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), } } @@ -334,7 +336,7 @@ pub async fn send_layout( rpc_cli .call( &rpc_host, - SystemRpc::AdvertiseClusterLayout(layout), + SystemRpc::AdvertiseClusterLayout(Arc::new(layout)), PRIO_NORMAL, ) .await??; -- cgit v1.2.3 From bfb1845fdc981a370539d641a5d80f438f184f07 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 14:12:05 +0100 Subject: layout: refactor to use a RwLock on LayoutHistory --- src/garage/cli/layout.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index bffc81d3..269d92f4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use bytesize::ByteSize; use format_table::format_table; @@ -323,7 +321,7 @@ pub async fn fetch_layout( .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? 
{ - SystemRpc::AdvertiseClusterLayout(t) => Ok(Arc::try_unwrap(t).unwrap()), + SystemRpc::AdvertiseClusterLayout(t) => Ok(t), resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), } } @@ -336,7 +334,7 @@ pub async fn send_layout( rpc_cli .call( &rpc_host, - SystemRpc::AdvertiseClusterLayout(Arc::new(layout)), + SystemRpc::AdvertiseClusterLayout(layout), PRIO_NORMAL, ) .await??; -- cgit v1.2.3 From bad7cc812ead88e9f334405c5c082d79c14c8898 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:42:10 +0100 Subject: layout admin: add missing calls to update_hash --- src/garage/cli/layout.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 269d92f4..15727448 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -329,8 +329,9 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - layout: LayoutHistory, + mut layout: LayoutHistory, ) -> Result<(), Error> { + layout.update_hashes(); rpc_cli .call( &rpc_host, -- cgit v1.2.3 From 1aab1f4e688ebc3f3adcb41c817c16c688a3291c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:06:16 +0100 Subject: layout: refactoring of all_nodes --- src/garage/cli/layout.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 15727448..0f01a37a 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -49,6 +49,7 @@ pub async fn cmd_assign_role( }; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let all_nodes = layout.all_nodes().into_owned(); let added_nodes = args .node_ids @@ -58,7 +59,7 @@ pub async fn cmd_assign_role( status .iter() .map(|adv| adv.id) - .chain(layout.current().node_ids().iter().cloned()), + .chain(all_nodes.iter().cloned()), node_id, ) }) @@ -68,8 +69,7 @@ pub async fn cmd_assign_role( roles.merge(&layout.staging.get().roles); for replaced in args.replace.iter() { - let replaced_node = - find_matching_node(layout.current().node_ids().iter().cloned(), replaced)?; + let replaced_node = find_matching_node(all_nodes.iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout -- cgit v1.2.3 From 393c4d4515e0cdadadc8de8ae2df12e4371cff88 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 14:20:50 +0100 Subject: layout: add helper for cached/external values to centralize recomputation --- src/garage/cli/layout.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 0f01a37a..51774314 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -49,7 +49,7 @@ pub async fn cmd_assign_role( }; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - let all_nodes = layout.all_nodes().into_owned(); + let all_nodes = layout.get_all_nodes(); let added_nodes = args .node_ids @@ -331,7 +331,6 @@ pub async fn send_layout( rpc_host: NodeID, mut layout: LayoutHistory, ) -> Result<(), Error> { - layout.update_hashes(); rpc_cli .call( &rpc_host, -- cgit v1.2.3 From 33c8a489b0a9c0e869282bfc19c548f5a3e02e8c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 15:40:44 +0100 Subject: layou: implement ack locking --- src/garage/cli/layout.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/garage/cli') diff --git 
a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 51774314..0be8278f 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -329,7 +329,7 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - mut layout: LayoutHistory, + layout: LayoutHistory, ) -> Result<(), Error> { rpc_cli .call( -- cgit v1.2.3 From 539a920313fff010b8a4291aeef58ec9a14ee635 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 13:18:59 +0100 Subject: cli: show when nodes are draining metadata --- src/garage/cli/cmd.rs | 172 +++++++++++++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 64 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 1a054025..c99243b9 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::time::Duration; use format_table::format_table; @@ -62,35 +62,69 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - match layout.current().roles.get(&adv.id) { - Some(NodeRoleV(Some(cfg))) => { - let data_avail = match &adv.status.data_disk_avail { - _ if cfg.capacity.is_none() => "N/A".into(), - Some((avail, total)) => { - let pct = (*avail as f64) / (*total as f64) * 100.; - let avail = bytesize::ByteSize::b(*avail); - format!("{} ({:.1}%)", avail, pct) - } - None => "?".into(), - }; + if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { + let data_avail = match &adv.status.data_disk_avail { + _ if cfg.capacity.is_none() => "N/A".into(), + Some((avail, total)) => { + let pct = (*avail as f64) / (*total as f64) * 100.; + let avail = bytesize::ByteSize::b(*avail); + format!("{} ({:.1}%)", avail, pct) + } + None => "?".into(), + }; + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + id = adv.id, + host = adv.status.hostname, + addr = adv.addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = cfg.capacity_string(), + data_avail = data_avail, + )); + } else { + let prev_role = layout + .versions + .iter() + .rev() + .find_map(|x| match x.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => Some(cfg), + _ => None, + }); + let historic_role = + layout + .old_versions + .iter() + .rev() + .find_map(|x| match x.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => Some(cfg), + _ => None, + }); + if let Some(cfg) = prev_role { healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, host = adv.status.hostname, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, - capacity = cfg.capacity_string(), - data_avail = data_avail, )); - } - _ => { + } else if let Some(cfg) = historic_role { + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tremoved, metadata drained", + id = adv.id, + host = adv.status.hostname, + addr = adv.addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + )); + } else { let new_role = match layout.staging.get().roles.get(&adv.id) { - Some(NodeRoleV(Some(_))) => "(pending)", + Some(NodeRoleV(Some(_))) => "pending...", _ => "NO ROLE ASSIGNED", }; healthy_nodes.push(format!( - "{id:?}\t{h}\t{addr}\t{new_role}", + 
"{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, h = adv.status.hostname, addr = adv.addr, @@ -101,55 +135,65 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } format_table(healthy_nodes); - let status_keys = status.iter().map(|adv| adv.id).collect::>(); - let failure_case_1 = status.iter().any(|adv| { - !adv.is_up - && matches!( - layout.current().roles.get(&adv.id), - Some(NodeRoleV(Some(_))) - ) - }); - let failure_case_2 = layout - .current() - .roles - .items() + // Determine which nodes are unhealthy and print that to stdout + let status_map = status .iter() - .any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some()); - if failure_case_1 || failure_case_2 { - println!("\n==== FAILED NODES ===="); - let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; - for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { - let tf = timeago::Formatter::new(); - failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - last_seen = adv - .last_seen_secs_ago - .map(|s| tf.convert(Duration::from_secs(s))) - .unwrap_or_else(|| "never seen".into()), - )); + .map(|adv| (adv.id, adv)) + .collect::>(); + + let tf = timeago::Formatter::new(); + let mut failed_nodes = + vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; + let mut listed = HashSet::new(); + for ver in layout.versions.iter().rev() { + for (node, _, role) in ver.roles.items().iter() { + let cfg = match role { + NodeRoleV(Some(role)) if role.capacity.is_some() => role, + _ => continue, + }; + + if listed.contains(node) { + continue; } - } - for (id, _, role_v) in layout.current().roles.items().iter() { - if let NodeRoleV(Some(cfg)) = role_v { - if !status_keys.contains(id) { - failed_nodes.push(format!( - "{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen", - id = id, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - )); - } + listed.insert(*node); + + let adv = status_map.get(node); + if adv.map(|x| x.is_up).unwrap_or(false) { + continue; } + + // Node is in a layout version, is not a gateway node, and is not up: + // it is in a failed state, add proper line to the output + let (host, addr, last_seen) = match adv { + Some(adv) => ( + adv.status.hostname.as_str(), + adv.addr.to_string(), + adv.last_seen_secs_ago + .map(|s| tf.convert(Duration::from_secs(s))) + .unwrap_or_else(|| "never seen".into()), + ), + None => ("??", "??".into(), "never seen".into()), + }; + let capacity = if ver.version == layout.current().version { + cfg.capacity_string() + } else { + "draining metadata...".to_string() + }; + failed_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", + id = node, + host = host, + addr = addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = capacity, + last_seen = last_seen, + )); } + } + + if failed_nodes.len() > 1 { + println!("\n==== FAILED NODES ===="); format_table(failed_nodes); } -- cgit v1.2.3 From 11e6fef93ce3ca56584fc99223b71da77d320dd7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 16:17:41 +0100 Subject: cli: add layout history and layout assume-sync commands --- src/garage/cli/cmd.rs | 14 +++++- src/garage/cli/layout.rs | 111 
++++++++++++++++++++++++++++++++++++++++++++++ src/garage/cli/structs.rs | 16 +++++++ 3 files changed, 139 insertions(+), 2 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c99243b9..08ed00cf 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -135,13 +135,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } format_table(healthy_nodes); - // Determine which nodes are unhealthy and print that to stdout + // Determine which nodes are unhealthy and print that to stdout let status_map = status .iter() .map(|adv| (adv.id, adv)) .collect::>(); let tf = timeago::Formatter::new(); + let mut drain_msg = false; let mut failed_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; let mut listed = HashSet::new(); @@ -163,7 +164,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } // Node is in a layout version, is not a gateway node, and is not up: - // it is in a failed state, add proper line to the output + // it is in a failed state, add proper line to the output let (host, addr, last_seen) = match adv { Some(adv) => ( adv.status.hostname.as_str(), @@ -177,6 +178,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let capacity = if ver.version == layout.current().version { cfg.capacity_string() } else { + drain_msg = true; "draining metadata...".to_string() }; failed_nodes.push(format!( @@ -195,6 +197,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> if failed_nodes.len() > 1 { println!("\n==== FAILED NODES ===="); format_table(failed_nodes); + if drain_msg { + println!(); + println!("Your cluster is expecting to drain data from nodes that are currently unavailable."); + println!("If these nodes are definitely dead, please review the layout history with"); + println!( + "`garage layout history` and use `garage layout assume-sync` to force progress." 
+ ); + } } if print_staging_role_changes(&layout) { diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 0be8278f..3c7843bd 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -32,6 +32,10 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Config(config_opt) => { cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await } + LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await, + LayoutOperation::AssumeSync(assume_sync_opt) => { + cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await + } } } @@ -311,6 +315,113 @@ pub async fn cmd_config_layout( Ok(()) } +pub async fn cmd_layout_history( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result<(), Error> { + let layout = fetch_layout(rpc_cli, rpc_host).await?; + let min_stored = layout.min_stored(); + + println!("==== LAYOUT HISTORY ===="); + let mut table = vec!["Version\tStatus\tStorage nodes\tGateway nodes".to_string()]; + for ver in layout + .versions + .iter() + .rev() + .chain(layout.old_versions.iter().rev()) + { + let status = if ver.version == layout.current().version { + "current" + } else if ver.version >= min_stored { + "draining" + } else { + "historical" + }; + table.push(format!( + "#{}\t{}\t{}\t{}", + ver.version, + status, + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_some())) + .count(), + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_none())) + .count(), + )); + } + format_table(table); + + println!(); + println!("==== UPDATE TRACKERS ===="); + println!("This is the internal data that Garage stores to know which nodes have what data."); + println!(); + let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + table.push(format!( + "{:?}\t#{}\t#{}\t#{}", + node, + layout.update_trackers.ack_map.get(node), + layout.update_trackers.sync_map.get(node), + layout.update_trackers.sync_ack_map.get(node), + )); + } + table[1..].sort(); + format_table(table); + + if layout.versions.len() > 1 { + println!(); + println!( + "If some nodes are not catching up to the latest layout version in the update tracker," + ); + println!("it might be because they are offline or unable to complete a sync successfully."); + println!( + "You may force progress using `garage layout assume-sync --version {}`", + layout.current().version + ); + } + + Ok(()) +} + +pub async fn cmd_layout_assume_sync( + rpc_cli: &Endpoint, + rpc_host: NodeID, + opt: AssumeSyncOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + let min_v = layout.min_stored(); + if opt.version <= min_v || opt.version > layout.current().version { + return Err(Error::Message(format!( + "Invalid version, you may use the following version numbers: {}", + (min_v + 1..=layout.current().version) + .map(|x| x.to_string()) + .collect::>() + .join(" ") + ))); + } + + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + layout.update_trackers.ack_map.set_max(*node, opt.version); + layout.update_trackers.sync_map.set_max(*node, opt.version); + layout + .update_trackers + .sync_ack_map + .set_max(*node, opt.version); + } + + send_layout(rpc_cli, rpc_host, layout).await?; + println!("Success."); + + Ok(()) +} + // --- utility --- pub async fn fetch_layout( diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 
3badc447..c4b400f4 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -112,6 +112,14 @@ pub enum LayoutOperation { /// Revert staged changes to cluster layout #[structopt(name = "revert", version = garage_version())] Revert(RevertLayoutOpt), + + /// View the history of layouts in the cluster + #[structopt(name = "history", version = garage_version())] + History, + + /// Assume all nodes are synchronized up to a certain layout version + #[structopt(name = "assume-sync", version = garage_version())] + AssumeSync(AssumeSyncOpt), } #[derive(StructOpt, Debug)] @@ -169,6 +177,14 @@ pub struct RevertLayoutOpt { pub(crate) yes: bool, } +#[derive(StructOpt, Debug)] +pub struct AssumeSyncOpt { + /// Version number of the layout to assume is currently up-to-date. + /// This will generally be the current layout version. + #[structopt(long = "version")] + pub(crate) version: u64, +} + #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum BucketOperation { /// List buckets -- cgit v1.2.3 From c539077d30809c9d2232aa0fe107a9652dcb7c26 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 16:20:19 +0100 Subject: cli: remove historic layout info from status --- src/garage/cli/cmd.rs | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 08ed00cf..4d1306b6 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -91,15 +91,6 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> Some(NodeRoleV(Some(cfg))) => Some(cfg), _ => None, }); - let historic_role = - layout - .old_versions - .iter() - .rev() - .find_map(|x| match x.roles.get(&adv.id) { - Some(NodeRoleV(Some(cfg))) => Some(cfg), - _ => None, - }); if let Some(cfg) = prev_role { healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", @@ -109,15 +100,6 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> tags = cfg.tags.join(","), zone = cfg.zone, )); - } else if let Some(cfg) = historic_role { - healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tremoved, metadata drained", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tags = cfg.tags.join(","), - zone = cfg.zone, - )); } else { let new_role = match layout.staging.get().roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "pending...", -- cgit v1.2.3 From c04dd8788a3764da2f307b1d10c2d56b7b0e4a61 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 28 Nov 2023 14:25:04 +0100 Subject: admin: more info in admin GetClusterStatus --- src/garage/cli/cmd.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 4d1306b6..c7f0ad2b 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -62,6 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { + let host = adv.status.hostname.as_deref().unwrap_or("?"); if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -75,7 +76,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", id = adv.id, - host = 
adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -95,7 +96,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -108,7 +109,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, - h = adv.status.hostname, + h = host, addr = adv.addr, new_role = new_role, )); @@ -149,7 +150,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> // it is in a failed state, add proper line to the output let (host, addr, last_seen) = match adv { Some(adv) => ( - adv.status.hostname.as_str(), + adv.status.hostname.as_deref().unwrap_or("?"), adv.addr.to_string(), adv.last_seen_secs_ago .map(|s| tf.convert(Duration::from_secs(s))) -- cgit v1.2.3 From aa59059a910eb6e1e824b84413a66909d697ef8a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 11:50:00 +0100 Subject: layout cli: safer skip-dead-nodes command --- src/garage/cli/cmd.rs | 23 ++++++++++++++++------- src/garage/cli/layout.rs | 35 +++++++++++++++++++++++++---------- src/garage/cli/structs.rs | 12 ++++++++---- 3 files changed, 49 insertions(+), 21 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c7f0ad2b..196c0cb3 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -49,13 +49,7 @@ pub async fn cli_command_dispatch( } pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> Result<(), Error> { - let status = match rpc_cli - .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) - .await?? - { - SystemRpc::ReturnKnownNodes(nodes) => nodes, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; + let status = fetch_status(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== HEALTHY NODES ===="); @@ -268,3 +262,18 @@ pub async fn cmd_admin( } Ok(()) } + +// ---- utility ---- + +pub async fn fetch_status( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result, Error> { + match rpc_cli + .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) + .await?? 
+ { + SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), + resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + } +} diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 3c7843bd..cdf77c04 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -33,8 +33,8 @@ pub async fn cli_layout_command_dispatch( cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await } LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await, - LayoutOperation::AssumeSync(assume_sync_opt) => { - cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await + LayoutOperation::SkipDeadNodes(assume_sync_opt) => { + cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await } } } @@ -388,13 +388,21 @@ pub async fn cmd_layout_history( Ok(()) } -pub async fn cmd_layout_assume_sync( +pub async fn cmd_layout_skip_dead_nodes( rpc_cli: &Endpoint, rpc_host: NodeID, - opt: AssumeSyncOpt, + opt: SkipDeadNodesOpt, ) -> Result<(), Error> { + let status = fetch_status(rpc_cli, rpc_host).await?; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + if layout.versions.len() == 1 { + return Err(Error::Message( + "This command cannot be called when there is only one live cluster layout version" + .into(), + )); + } + let min_v = layout.min_stored(); if opt.version <= min_v || opt.version > layout.current().version { return Err(Error::Message(format!( @@ -408,12 +416,19 @@ pub async fn cmd_layout_assume_sync( let all_nodes = layout.get_all_nodes(); for node in all_nodes.iter() { - layout.update_trackers.ack_map.set_max(*node, opt.version); - layout.update_trackers.sync_map.set_max(*node, opt.version); - layout - .update_trackers - .sync_ack_map - .set_max(*node, opt.version); + if status.iter().any(|x| x.id == *node && x.is_up) { + continue; + } + + if layout.update_trackers.ack_map.set_max(*node, opt.version) { + println!("Increased the ACK tracker for node {:?}", node); + } + + if opt.allow_missing_data { + if layout.update_trackers.sync_map.set_max(*node, opt.version) { + println!("Increased the SYNC tracker for node {:?}", node); + } + } } send_layout(rpc_cli, rpc_host, layout).await?; diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index c4b400f4..6bc3da22 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -117,9 +117,9 @@ pub enum LayoutOperation { #[structopt(name = "history", version = garage_version())] History, - /// Assume all nodes are synchronized up to a certain layout version - #[structopt(name = "assume-sync", version = garage_version())] - AssumeSync(AssumeSyncOpt), + /// Skip dead nodes when awaiting for a new layout version to be synchronized + #[structopt(name = "skip-dead-nodes", version = garage_version())] + SkipDeadNodes(SkipDeadNodesOpt), } #[derive(StructOpt, Debug)] @@ -178,11 +178,15 @@ pub struct RevertLayoutOpt { } #[derive(StructOpt, Debug)] -pub struct AssumeSyncOpt { +pub struct SkipDeadNodesOpt { /// Version number of the layout to assume is currently up-to-date. /// This will generally be the current layout version. 
#[structopt(long = "version")] pub(crate) version: u64, + /// Allow the skip even if a quorum of ndoes could not be found for + /// the data among the remaining nodes + #[structopt(long = "allow-missing-data")] + pub(crate) allow_missing_data: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug)] -- cgit v1.2.3 From 9cecea64d4509e95ac9793b29c947e2ecf9bb0b8 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 14:27:53 +0100 Subject: layout: allow sync update tracker to progress with only quorums --- src/garage/cli/layout.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index cdf77c04..fac826f5 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -365,9 +365,9 @@ pub async fn cmd_layout_history( table.push(format!( "{:?}\t#{}\t#{}\t#{}", node, - layout.update_trackers.ack_map.get(node), - layout.update_trackers.sync_map.get(node), - layout.update_trackers.sync_ack_map.get(node), + layout.update_trackers.ack_map.get(node, min_stored), + layout.update_trackers.sync_map.get(node, min_stored), + layout.update_trackers.sync_ack_map.get(node, min_stored), )); } table[1..].sort(); -- cgit v1.2.3 From 7f2541101f15614c79020b35d3d7dab767c32676 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 11:24:23 +0100 Subject: cli: improvements to the layout commands when multiple layouts are live --- src/garage/cli/cmd.rs | 4 +-- src/garage/cli/layout.rs | 67 +++++++++++++++++++++++++++++++----------------- src/garage/cli/util.rs | 4 ++- 3 files changed, 48 insertions(+), 27 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 196c0cb3..fb6dface 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -179,7 +179,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> println!("Your cluster is expecting to drain data from nodes that are currently unavailable."); println!("If these nodes are definitely dead, please review the layout history with"); println!( - "`garage layout history` and use `garage layout assume-sync` to force progress." + "`garage layout history` and use `garage layout skip-dead-nodes` to force progress." ); } } @@ -274,6 +274,6 @@ pub async fn fetch_status( .await?? 
{ SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), - resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + resp => Err(Error::unexpected_rpc_message(resp)), } } diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index fac826f5..f76e33c5 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -354,35 +354,44 @@ pub async fn cmd_layout_history( )); } format_table(table); - - println!(); - println!("==== UPDATE TRACKERS ===="); - println!("This is the internal data that Garage stores to know which nodes have what data."); println!(); - let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; - let all_nodes = layout.get_all_nodes(); - for node in all_nodes.iter() { - table.push(format!( - "{:?}\t#{}\t#{}\t#{}", - node, - layout.update_trackers.ack_map.get(node, min_stored), - layout.update_trackers.sync_map.get(node, min_stored), - layout.update_trackers.sync_ack_map.get(node, min_stored), - )); - } - table[1..].sort(); - format_table(table); if layout.versions.len() > 1 { + println!("==== UPDATE TRACKERS ===="); + println!("Several layout versions are currently live in the version, and data is being migrated."); + println!( + "This is the internal data that Garage stores to know which nodes have what data." + ); + println!(); + let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + table.push(format!( + "{:?}\t#{}\t#{}\t#{}", + node, + layout.update_trackers.ack_map.get(node, min_stored), + layout.update_trackers.sync_map.get(node, min_stored), + layout.update_trackers.sync_ack_map.get(node, min_stored), + )); + } + table[1..].sort(); + format_table(table); + println!(); println!( - "If some nodes are not catching up to the latest layout version in the update tracker," + "If some nodes are not catching up to the latest layout version in the update trackers," ); println!("it might be because they are offline or unable to complete a sync successfully."); println!( - "You may force progress using `garage layout assume-sync --version {}`", + "You may force progress using `garage layout skip-dead-nodes --version {}`", layout.current().version ); + } else { + println!("Your cluster is currently in a stable state with a single live layout version."); + println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,"); + println!( + "so you might want to keep old nodes online until their data directories become empty." 
+ ); } Ok(()) @@ -415,6 +424,7 @@ pub async fn cmd_layout_skip_dead_nodes( } let all_nodes = layout.get_all_nodes(); + let mut did_something = false; for node in all_nodes.iter() { if status.iter().any(|x| x.id == *node && x.is_up) { continue; @@ -422,19 +432,28 @@ pub async fn cmd_layout_skip_dead_nodes( if layout.update_trackers.ack_map.set_max(*node, opt.version) { println!("Increased the ACK tracker for node {:?}", node); + did_something = true; } if opt.allow_missing_data { if layout.update_trackers.sync_map.set_max(*node, opt.version) { println!("Increased the SYNC tracker for node {:?}", node); + did_something = true; } } } - send_layout(rpc_cli, rpc_host, layout).await?; - println!("Success."); - - Ok(()) + if did_something { + send_layout(rpc_cli, rpc_host, layout).await?; + println!("Success."); + Ok(()) + } else if !opt.allow_missing_data { + Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into())) + } else { + Err(Error::Message( + "Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(), + )) + } } // --- utility --- @@ -448,7 +467,7 @@ pub async fn fetch_layout( .await?? { SystemRpc::AdvertiseClusterLayout(t) => Ok(t), - resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + resp => Err(Error::unexpected_rpc_message(resp)), } } diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 2232d395..0511e2b1 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -450,6 +450,8 @@ pub fn print_block_info( if refcount != nondeleted_count { println!(); - println!("Warning: refcount does not match number of non-deleted versions"); + println!( + "Warning: refcount does not match number of non-deleted versions (see issue #644)." 
+ ); } } -- cgit v1.2.3 From f537f76681760e9b2b3cc095a6031ebb59ca4733 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 13:24:47 +0100 Subject: [rm-migration] Remove migration path from Garage v0.5 --- src/garage/cli/cmd.rs | 3 --- src/garage/cli/structs.rs | 22 ---------------------- 2 files changed, 25 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index fb6dface..7440457f 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -33,9 +33,6 @@ pub async fn cli_command_dispatch( Command::Key(ko) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::KeyOperation(ko)).await } - Command::Migrate(mo) => { - cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Migrate(mo)).await - } Command::Repair(ro) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 40e47ee1..63014dbc 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -31,11 +31,6 @@ pub enum Command { #[structopt(name = "key", version = garage_version())] Key(KeyOperation), - /// Run migrations from previous Garage version - /// (DO NOT USE WITHOUT READING FULL DOCUMENTATION) - #[structopt(name = "migrate", version = garage_version())] - Migrate(MigrateOpt), - /// Start repair of node data on remote node #[structopt(name = "repair", version = garage_version())] Repair(RepairOpt), @@ -445,23 +440,6 @@ pub struct KeyImportOpt { pub yes: bool, } -#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] -pub struct MigrateOpt { - /// Confirm the launch of the migrate operation - #[structopt(long = "yes")] - pub yes: bool, - - #[structopt(subcommand)] - pub what: MigrateWhat, -} - -#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] -pub enum MigrateWhat { - /// Migrate buckets and permissions from v0.5.0 - #[structopt(name = "buckets050", version = garage_version())] - Buckets050, -} - #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] pub struct RepairOpt { /// Launch repair operation on all nodes -- cgit v1.2.3 From 44454aac012cbef9158110f2352301ffcfaf31c7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 14:11:02 +0100 Subject: [rm-sled] Remove the Sled database engine --- src/garage/cli/convert_db.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/convert_db.rs b/src/garage/cli/convert_db.rs index 2aadb1d6..5346d55a 100644 --- a/src/garage/cli/convert_db.rs +++ b/src/garage/cli/convert_db.rs @@ -11,7 +11,7 @@ pub struct ConvertDbOpt { /// https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0) #[structopt(short = "i")] input_path: PathBuf, - /// Input database engine (sled, lmdb or sqlite; limited by db engines + /// Input database engine (lmdb or sqlite; limited by db engines /// enabled in this build) #[structopt(short = "a")] input_engine: Engine, -- cgit v1.2.3 From 05c92204ecab87540806073ac4deedfd58519240 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 14:59:56 +0100 Subject: [rm-sled] Remove counted_tree_hack --- src/garage/cli/structs.rs | 4 ---- 1 file changed, 4 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 40e47ee1..7e7ab71b 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -553,10 +553,6 @@ pub struct StatsOpt { #[structopt(short = "a", long = "all-nodes")] pub 
all_nodes: bool, - /// Gather detailed statistics (this can be long) - #[structopt(short = "d", long = "detailed")] - pub detailed: bool, - /// Don't show global cluster stats (internal use in RPC) #[structopt(skip)] #[serde(default)] -- cgit v1.2.3 From dc0b78cdb88e9cbfd7dc1a2ee0b15333939be549 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 11:04:20 +0100 Subject: [block-ref-repair] Block refcount recalculation and repair - We always recalculate the reference count of a block before deleting it locally, to make sure that it is indeed zero. - If we had to fetch a remote block but we were not able to get it, check that refcount is indeed > 0. - Repair procedure that checks everything --- src/garage/cli/structs.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 1f572a9a..8380b5e2 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -473,8 +473,11 @@ pub enum RepairWhat { #[structopt(name = "mpu", version = garage_version())] MultipartUploads, /// Repropagate version deletions to the block ref table - #[structopt(name = "block_refs", version = garage_version())] + #[structopt(name = "block-refs", version = garage_version())] BlockRefs, + /// Recalculate block reference counters + #[structopt(name = "block-rc", version = garage_version())] + BlockRc, /// Verify integrity of all blocks on disc #[structopt(name = "scrub", version = garage_version())] Scrub { -- cgit v1.2.3 From 3eab639c146f67fc67534633ae26c9aec116327d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 16:24:34 +0100 Subject: [block-ref-repair] mention `garage block repair-rc` in documentation --- src/garage/cli/util.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 0511e2b1..21c14f42 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -451,7 +451,7 @@ pub fn print_block_info( if refcount != nondeleted_count { println!(); println!( - "Warning: refcount does not match number of non-deleted versions (see issue #644)." + "Warning: refcount does not match number of non-deleted versions, you should try `garage repair block-rc`." 
); } } -- cgit v1.2.3 From 961b4f9af36a7fb5d3a661ac19e8f2c168bb48ae Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 10:45:34 +0100 Subject: [net-fixes] fix issues with local peer address (fix #761) --- src/garage/cli/cmd.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'src/garage/cli') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index a84061a7..44d3d96c 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -57,6 +57,10 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { let host = adv.status.hostname.as_deref().unwrap_or("?"); + let addr = match adv.addr { + Some(addr) => addr.to_string(), + None => "N/A".to_string(), + }; if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -71,7 +75,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", id = adv.id, host = host, - addr = adv.addr, + addr = addr, tags = cfg.tags.join(","), zone = cfg.zone, capacity = cfg.capacity_string(), @@ -91,7 +95,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, host = host, - addr = adv.addr, + addr = addr, tags = cfg.tags.join(","), zone = cfg.zone, )); @@ -104,7 +108,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, h = host, - addr = adv.addr, + addr = addr, new_role = new_role, )); } @@ -120,8 +124,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let tf = timeago::Formatter::new(); let mut drain_msg = false; - let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; + let mut failed_nodes = vec!["ID\tHostname\tTags\tZone\tCapacity\tLast seen".to_string()]; let mut listed = HashSet::new(); for ver in layout.versions.iter().rev() { for (node, _, role) in ver.roles.items().iter() { @@ -142,15 +145,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> // Node is in a layout version, is not a gateway node, and is not up: // it is in a failed state, add proper line to the output - let (host, addr, last_seen) = match adv { + let (host, last_seen) = match adv { Some(adv) => ( adv.status.hostname.as_deref().unwrap_or("?"), - adv.addr.to_string(), adv.last_seen_secs_ago .map(|s| tf.convert(Duration::from_secs(s))) .unwrap_or_else(|| "never seen".into()), ), - None => ("??", "??".into(), "never seen".into()), + None => ("??", "never seen".into()), }; let capacity = if ver.version == layout.current().version { cfg.capacity_string() @@ -159,10 +161,9 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "draining metadata...".to_string() }; failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", + "{id:?}\t{host}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", id = node, host = host, - addr = addr, tags = cfg.tags.join(","), zone = cfg.zone, capacity = capacity, -- cgit v1.2.3
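Taken together, the layout commands and flags introduced over this patch series (`garage layout history`, `garage layout skip-dead-nodes` with its `--version` and `--allow-missing-data` options, and the `--yes` confirmation now required by `garage layout revert`) are used roughly as follows. This is only a usage sketch: the commands and flags are the ones defined in the diffs above, but the layout version number shown is illustrative.

    # Inspect live layout versions and the ack/sync/sync_ack update trackers
    garage layout history

    # If nodes still referenced by an older layout version are permanently dead,
    # bump their ACK tracker so the cluster can converge on the current version
    garage layout skip-dead-nodes --version 2

    # Additionally bump the SYNC tracker when no quorum can complete a metadata sync
    garage layout skip-dead-nodes --version 2 --allow-missing-data

    # Abandon all staged role or parameter changes
    garage layout revert --yes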